Installs and Imports¶
In [8]:
import re
import string
import subprocess
import sys
import warnings
warnings.filterwarnings('ignore')
REQS = [
('pip', 'pip==24.2'),
('lightgbm', 'lightgbm==4.5.0'),
('matplotlib', 'matplotlib==3.9.2'),
('mlxtend', 'mlxtend==0.23.1'),
('nltk', 'nltk==3.9.1'),
('numpy', 'numpy==2.0.2'),
('optuna', 'optuna==4.0.0'),
('pandas', 'pandas==2.2.2'),
('seaborn', 'seaborn==0.13.2'),
('sklearn', 'scikit-learn==1.5.2'),
('statsmodels', 'statsmodels==0.14.3'),
('umap-learn', 'umap-learn==0.5.6'),
('xgboost', 'xgboost==2.1.1'),
]
try:
subprocess.check_call([sys.executable, '-m', 'ensurepip'])
except Exception as e:
print(e, file=sys.stderr)
def ensure_installed(module_info):
_, install_str = module_info
try:
subprocess.check_call([sys.executable, '-m',
'pip', 'install', '--quiet',
install_str])
print(f'Installed "{install_str}".')
except Exception as e:
print(e, file=sys.stderr)
for m in REQS:
ensure_installed(m)
# Standard libraries
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
# Machine learning and data processing
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
accuracy_score,
calinski_harabasz_score,
classification_report,
confusion_matrix,
mean_squared_error,
silhouette_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Statistical modeling
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Dimensionality reduction
import umap
# Hyperparameter optimization
import optuna
# Other machine learning libraries
import lightgbm as lgb
from xgboost import XGBClassifier
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
def find_columns_with_missing(data, columns):
"""Finding features that have a lot of missing data"""
print()
print('Finding columns with missing data...')
data_cleaned = data
missing = []
i = 0
for col in columns:
missing.append(data[col].isnull().sum())
if missing[i] > 0:
print()
print(f'Column {col} is missing {missing[i]} values.')
print(f'Proportion of missing data is {missing[i]/len(data)}.')
if missing[i]/len(data) >= 0.9:
print(f'Dropping column {col}...')
data_cleaned = data_cleaned.drop(columns=col)
i += 1
return missing, data_cleaned
def hex_to_rgb(hex_color):
"""Function to convert hex to RGB"""
# Remove the '#' if it exists
hex_color = hex_color.lstrip('#')
# Convert hex to integer and split into RGB components
return [int(hex_color[i:i+2], 16) for i in (0, 2, 4)]
def preprocess_text(text):
"""Preprocessing function"""
text = text.lower()
# Remove punctuation and special characters
text = text.translate(str.maketrans('', '', string.punctuation)) # Removes punctuation
text = re.sub(r'[^A-Za-z\s]', '', text)
# Tokenize the text
tokens = word_tokenize(text)
# Remove stopwords
tokens = [word for word in tokens if word not in stop_words]
# Lemmatize the tokens
tokens = [lemmatizer.lemmatize(word) for word in tokens]
# Join tokens back into a string
return ' '.join(tokens)
def plot_silhouette_bar_across_experiments(model_names, silhouette_scores):
n_experiments = len(silhouette_scores)
n_models = len(model_names)
bar_width = 0.2
index = np.arange(n_experiments)
plt.figure(figsize=(12, 6))
for i, model_name in enumerate(model_names):
sil_scores = [exp_scores[i] for exp_scores in silhouette_scores]
plt.bar(index + i * bar_width,sil_scores, bar_width, label=model_name)
plt.xlabel('Experiments')
plt.ylabel('Silhouette scores')
plt.title('Silhouette scores Across Models and Experiments')
plt.xticks(index + bar_width * (n_models - 1) / 2, [f'Exp {i+1}' for i in range(n_experiments)])
plt.legend()
plt.tight_layout()
plt.show()
def visualize_ch_index_across_experiments(model_names, ch_scores):
n_experiments = len(ch_scores)
n_models = len(model_names)
bar_width = 0.2
index = np.arange(n_experiments)
plt.figure(figsize=(12, 6))
for i, model_name in enumerate(model_names):
ch_score = [exp_scores[i] for exp_scores in ch_scores]
plt.bar(index + i * bar_width, ch_score, bar_width, label=model_name)
plt.xlabel('Experiments')
plt.ylabel('Calinski-Harabasz Index')
plt.title('Calinski-Harabasz Index Across Models and Experiments')
plt.xticks(index + bar_width * (n_models - 1) / 2, [f'Exp {i+1}' for i in range(n_experiments)])
plt.legend()
plt.tight_layout()
plt.show()
class KMeansClustering:
def __init__(self, data):
self.data = data
self.best_params = None
self.kmeans_model = None
def tune_hyperparameters(self, n_trials=15):
def objective_kmeans(trial):
n_clusters = trial.suggest_int('n_clusters', 2, 10)
init_method = trial.suggest_categorical('init', ['k-means++', 'random'])
kmeans = KMeans(n_clusters=n_clusters, init=init_method, random_state=42)
kmeans.fit(self.data)
labels = kmeans.labels_
score = silhouette_score(self.data, labels)
return score
study = optuna.create_study(direction="maximize")
study.optimize(objective_kmeans, n_trials=n_trials)
self.best_params = study.best_params
print("Best params:", self.best_params)
def fit_model(self):
self.kmeans_model = KMeans(n_clusters=self.best_params['n_clusters'],
init=self.best_params['init'],
random_state=42)
self.kmeans_model.fit(self.data)
def visualize_clusters(self, umap_embedding, feature):
labels = self.kmeans_model.labels_
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# Scatter plot in 3D
scatter = ax.scatter(
umap_embedding[:, 0],
umap_embedding[:, 1],
umap_embedding[:, 2],
c=labels,
cmap='viridis',
s=30
)
# Add labels and title
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
ax.set_zlabel('UMAP Dimension 3')
plt.title(f'3D UMAP of K-Means Clusters on {feature}')
# Add a color bar for better visual distinction of clusters
plt.colorbar(scatter)
# Show the plot
plt.show()
def plot_elbow_method(self, k_range=(2, 10)):
"""
Plot the Elbow Method for choosing the optimal number of clusters
Args:
- k_range: tuple, range of cluster numbers to evaluate
"""
inertia = []
K = range(k_range[0], k_range[1] + 1)
for k in K:
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(self.data)
inertia.append(kmeans.inertia_) # Sum of squared distances to closest cluster center
plt.figure(figsize=(8, 6))
plt.plot(K, inertia, 'bo-', markersize=8)
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (Sum of squared distances)')
plt.grid(True)
plt.show()
def output_label(self):
return self.kmeans_model.labels_
def silhoutte(self):
score = silhouette_score(self.data, self.kmeans_model.labels_)
print(f'The Silhouette score is {score}')
return score
def calinski(self):
if len(np.unique(self.kmeans_model.labels_)) > 1: # Only calculate if there are clusters
score = calinski_harabasz_score(self.data, self.kmeans_model.labels_)
else:
score = np.nan # If only one cluster (or all noise), set to NaN
print(f'The Callinski index is {score}')
return score
class DBSCANClustering:
def __init__(self, data):
self.data = data
self.best_params = None
self.dbscan_model = None
def tune_hyperparameters(self, n_trials=15):
def objective_dbscan(trial):
eps = trial.suggest_float('eps', 0.1, 2.0)
min_samples = trial.suggest_int('min_samples', 3, 20)
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan.fit(self.data)
labels = dbscan.labels_
if len(set(labels)) > 1:
score = silhouette_score(self.data, labels)
else:
score = -1
return score
study = optuna.create_study(direction="maximize")
study.optimize(objective_dbscan, n_trials=n_trials)
self.best_params = study.best_params
print("Found best params:", self.best_params)
def fit_model(self):
self.dbscan_model = DBSCAN(eps=self.best_params['eps'], min_samples=self.best_params['min_samples'])
self.dbscan_model.fit(self.data)
def visualize_clusters_and_outliers_3D(self, umap_embedding, feature):
labels = self.dbscan_model.labels_
# Separate clustered points and noise points
clustered_points = umap_embedding[labels >= 0] # Points part of a cluster
clustered_labels = labels[labels >= 0]
outliers = umap_embedding[labels == -1] # Noise points
# Create a 3D plot
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
# Plot the clustered points in different colors
scatter = ax.scatter(clustered_points[:, 0], clustered_points[:, 1], clustered_points[:, 2],
c=clustered_labels, cmap='viridis', s=30)
# Plot the outliers (noise points) in red with 'x' markers
ax.scatter(outliers[:, 0], outliers[:, 1], outliers[:, 2], c='red', marker='x', s=80, label='Outliers')
# Add labels and title
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
ax.set_zlabel('UMAP Dimension 3')
ax.set_title(f'DBSCAN 3D Clusters with Outliers on {feature}')
# Add a legend and color bar for clusters
plt.legend()
plt.colorbar(scatter, ax=ax)
plt.show()
def output_label(self):
return self.dbscan_model.labels_
def silhoutte(self):
score = silhouette_score(self.data, self.dbscan_model.labels_)
print(f'The Silhouette score is {score}')
return score
def calinski(self):
if len(np.unique(self.dbscan_model.labels_)) > 1: # Only calculate if there are clusters
score = calinski_harabasz_score(self.data, self.dbscan_model.labels_)
else:
score = np.nan # If only one cluster (or all noise), set to NaN
print(f'The Callinski index is {score}')
return score
class ClusteringDataRetriever:
def __init__(self, data, labels):
self.data = data
self.labels = labels
def get_data_with_labels(self):
# If Data is in a numpy array, convert it to a pandas DataFrame
if isinstance(self.data, np.ndarray):
df = pd.DataFrame(self.data)
else:
df = self.data.copy() # If already a DataFrame
# Add a new column for the cluster labels
df['Cluster_Label'] = self.labels
return df[['gender', 'gender:confidence', 'Cluster_Label']]
def get_cluster_data(self, cluster_label):
# Retrieve data points belonging to a specific cluster.
df = self.get_data_with_labels()
return df[df['Cluster_Label'] == cluster_label]
def get_noise_data(self):
# Retrieve Records classified as noise (-1 label) in DBSCAN.
return self.get_cluster_data(-1)
Installed "pip==24.2". Installed "lightgbm==4.5.0". Installed "matplotlib==3.9.2". Installed "mlxtend==0.23.1". Installed "nltk==3.9.1". Installed "numpy==2.0.2". Installed "optuna==4.0.0". Installed "pandas==2.2.2". Installed "seaborn==0.13.2". Installed "scikit-learn==1.5.2". Installed "statsmodels==0.14.3". Installed "umap-learn==0.5.6". Installed "xgboost==2.1.1".
EDA¶
In [9]:
# Main starts here
# Load the dataset
df = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')
# Quick view of the dataset
print()
print('Dataset Overview')
print(df.info())
print(df.head())
all_features = df.columns
missing_col, df_cleaned = find_columns_with_missing(df, all_features)
# Dropping rows where 'gender' is missing
df_cleaned = df_cleaned.dropna(subset=['gender'])
# Drop the 'profile_yn' column since it is not relevant to human/non-human classification
df_cleaned = df_cleaned.drop(columns=['profile_yn'])
# Now that we have handled the missing data, you can proceed with further analysis
print()
print('Dataset Overview')
print(df_cleaned.info())
print(df_cleaned.head())
print()
print('---- EXPLORATORY DATA ANALYSIS (EDA) ----')
current_num_features = df.select_dtypes(include=[np.number])
# Plot distribution of each numerical feature with gender as hue using seaborn
for feature in current_num_features:
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned, x=feature, hue='gender', bins=30, kde=True)
plt.title(f'Distribution of {feature} by Gender')
plt.show()
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_cleaned)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()
# Plot distribution of 'tweet_count' and 'retweet_count'
for column in ['tweet_count', 'retweet_count']:
plt.figure(figsize=(8, 6))
sns.histplot(data=df_cleaned, x=column, kde=True, bins=30)
plt.title(f'Distribution of {column.replace("_", " ").capitalize()}')
plt.show()
# Correlation analysis for numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
plt.show()
# Extracting date from 'created' and 'tweet_created' for time-based analysis
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
# Ensure 'created' and tweet_created are in datetime format
df_cleaned['created'] = pd.to_datetime(df_cleaned['created'], errors='coerce')
df_cleaned['tweet_created'] = pd.to_datetime(df_cleaned['tweet_created'], errors='coerce')
# assuming Data was up-to-date
df_cleaned['account_age'] = (pd.Timestamp.now() - df_cleaned['created']).dt.days
df_cleaned['tweets_per_day'] = df_cleaned['tweet_count'] / df_cleaned['account_age']
df_cleaned['retweets_per_day'] = df_cleaned['retweet_count'] / df_cleaned['account_age']
df_cleaned['favorites_per_day'] = df_cleaned['fav_number'] / df_cleaned['account_age']
# Plotting the distribution of profile creation over the years
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['profile_created_year'], kde=False, bins=15)
plt.title('Distribution of Profile Creation Years')
plt.xlabel('Profile Created Year')
plt.ylabel('count')
plt.show()
# Plotting the histogram of tweets per day
plt.figure(figsize=(10, 6))
sns.histplot(df_cleaned['tweets_per_day'], bins=50, kde=True)
plt.title('Distribution of Tweets Per Day')
plt.xlabel('Tweets Per Day')
plt.ylabel('Frequency')
plt.show()
# show the relationship between account age and tweets per day
plt.figure(figsize=(10, 6))
sns.scatterplot(x='account_age', y='tweets_per_day', data=df_cleaned)
plt.title('Account Age vs. Tweets Per Day')
plt.xlabel('Account Age (Days)')
plt.ylabel('Tweets Per Day')
plt.show()
# Exploring 'link_color' and 'sidebar_color' features
# Check number of NaN value in 'link_color' and 'sidebar_color' features
link_color_nan_count = df_cleaned['link_color'].isnull().sum()
sidebar_color_nan_count = df_cleaned['sidebar_color'].isnull().sum()
print()
print(f"Number of NaN values in 'link_color': {link_color_nan_count}.")
print(f"Number of NaN values in 'sidebar_color': {sidebar_color_nan_count}.")
# Check how many available colors in 'link_color' and 'sidebar_color' features
link_color_count = len(df_cleaned['link_color'].unique())
sidebar_color_count = len(df_cleaned['sidebar_color'].unique())
print(f'Number of link color is {link_color_count}.')
print(f'Number of side bar color is {sidebar_color_count}.')
# Apply the function to 'link_color' and 'sidebar_color'
df_cleaned['link_color'] = df_cleaned['link_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')
df_cleaned['sidebar_color'] = df_cleaned['sidebar_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')
# Drop rows where 'sidebar_color' is still NaN
df_cleaned = df_cleaned.dropna(subset=['link_color'])
df_cleaned = df_cleaned.dropna(subset=['sidebar_color'])
print(f"Number of NaN values in 'link_color': {df_cleaned['link_color'].isnull().sum()}")
print(f"Number of NaN values in 'sidebar_color': {df_cleaned['sidebar_color'].isnull().sum()}")
# top 15 colors
top_sidebar_colors = df_cleaned['sidebar_color'].value_counts().iloc[:15].index.tolist()
top_link_colors = df_cleaned['link_color'].value_counts().iloc[:15].index.tolist()
# print(top_sidebar_colors)
# Extract top 10 most common sidebar colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='sidebar_color', data=df_cleaned, order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index, palette=top_sidebar_colors)
plt.title('Top 15 Most Common Profile sidebar_color')
plt.ylabel('Sidebar Color')
plt.xlabel('count')
plt.grid()
plt.show()
# Extract top 10 most common link colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='link_color', data=df_cleaned, order=df_cleaned['link_color'].value_counts().iloc[:15].index, palette=top_link_colors)
plt.title('Top 15 Most Common Profile link_color')
plt.ylabel('Link Color')
plt.xlabel('count')
plt.grid()
plt.show()
# count plot for sidebar_color vs. gender
plt.figure(figsize=(10, 6))
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
sns.countplot(x='sidebar_color', hue='gender', data=df_cleaned,
order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common Sidebar Colors by Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()
# count plot for link_color vs. gender
plt.figure(figsize=(10, 6))
sns.countplot(x='link_color', hue='gender', data=df_cleaned,
order=df_cleaned['link_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common link Colors by Gender')
plt.xlabel('Link Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()
# Scatter plot for link_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='link_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['link_color'].isin(top_link_colors)],
palette='Set2', s=100, alpha=0.7)
plt.title('Link Colors vs. Tweet count with Gender')
plt.xlabel('Link Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()
# Scatter plot for sidebar_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='sidebar_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['sidebar_color'].isin(top_sidebar_colors)],
palette='Set2', s=100, alpha=0.7)
plt.title('Sidebar Colors vs. Tweet count with Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()
# Select columns to be used
col = ['gender', 'gender:confidence', 'description', 'favorites_per_day','link_color',
'retweets_per_day', 'sidebar_color', 'text', 'tweets_per_day','user_timezone', 'tweet_location', 'profile_created_year', 'tweet_created_year'
]
df_preprocessed = df_cleaned[col].copy()
# Remove rows where gender is 'Unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']
# Plot correlation matrix
corr_matrix = df_preprocessed.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()
# Drop one feature from highly correlated pairs (correlation > 0.9)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
df_preprocessed = df_preprocessed.drop(columns=to_drop)
# Filling missing values for important features
df_preprocessed['user_timezone'].fillna('Unknown', inplace=True)
df_preprocessed['tweet_location'].fillna('Unknown', inplace=True)
categorical_features = ['user_timezone', 'tweet_location']
# categorise types of features
# numerical features
df_num = df_preprocessed[['retweets_per_day', 'favorites_per_day', 'tweets_per_day', 'profile_created_year', 'tweet_created_year']].copy()
# categorical features with frequency encoding
freq_encoding_location = df_preprocessed['tweet_location'].value_counts(normalize=True)
df_preprocessed['tweet_location_encoded'] = df_preprocessed['tweet_location'].map(freq_encoding_location)
freq_encoding_timezone = df_preprocessed['user_timezone'].value_counts(normalize=True)
df_preprocessed['user_timezone_encoded'] = df_preprocessed['user_timezone'].map(freq_encoding_timezone)
# gender features
# encode the 'gender' column to numeric values
df_preprocessed['gender'] = df_preprocessed['gender'].replace({'male': 0, 'female': 1, 'brand': 2})
# Check for unique values in the 'gender' column after replacement
print()
print("Unique Values in 'gender'")
print(df_preprocessed['gender'].unique())
print(df_preprocessed.info())
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_preprocessed)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()
df_gender = df_preprocessed[['gender', 'gender:confidence']].copy()
# Drop the original categorical columns
df_preprocessed = df_preprocessed.drop(columns=categorical_features)
# Convert 'link_color' values
df_preprocessed['link_color_rgb'] = df_preprocessed['link_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))
# Convert 'sidebar_color' values
df_preprocessed['sidebar_color_rgb'] = df_preprocessed['sidebar_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))
rgb_df = pd.DataFrame(df_preprocessed['link_color_rgb'].to_list(), columns=['link_R', 'link_G', 'link_B'])
rgb_df = pd.concat([rgb_df, pd.DataFrame(df_preprocessed['sidebar_color_rgb'].to_list(), columns=['sidebar_R', 'sidebar_G', 'sidebar_B'])], axis=1)
# Drop the original color features
df_preprocessed = df_preprocessed.drop(columns=['link_color', 'sidebar_color', 'link_color_rgb', 'sidebar_color_rgb'])
# Check if all required features are there
print()
print('All Remaining Features')
print(df_preprocessed.columns.tolist())
# Define the numerical features to scale (filtering for int64 and float64 columns)
numerical_features = df_preprocessed.select_dtypes(include=[np.number])
# print(f'All current numerical features are {numerical_features.columns.tolist()}')
print()
print('Dataset Overview After PreProcessing')
print(df_preprocessed.info())
print()
print('---- NLP Processing ----')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
df_preprocessed['description'].fillna('', inplace=True)
df_preprocessed['text'].fillna('', inplace=True)
# df_preprocessed['name'].fillna('', inplace=True)
# Check the text features if they still contain NaN
print()
print(df_preprocessed.select_dtypes(include=[object]))
# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Apply preprocessing to the 'description', 'text', and 'name' columns
df_preprocessed['cleaned_description'] = df_preprocessed['description'].apply(lambda x: preprocess_text(str(x)))
df_preprocessed['cleaned_text'] = df_preprocessed['text'].apply(lambda x: preprocess_text(str(x)))
# df_preprocessed['cleaned_name'] = df_preprocessed['name'].apply(lambda x: preprocess_text(str(x)))
# Check the preprocessed data with preprocessed text features
print(df_preprocessed[['description', 'cleaned_description', 'text', 'cleaned_text']].head())
# Drop the original text features
df_preprocessed = df_preprocessed.drop(columns=['description','text'])
# Initialize TFIDF vectorizer for text features
print()
print('Applying TF-IDF Vectorisation...')
tfidf_vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')
# Apply TF-IDF on 'description', 'text', 'name' columns
tfidf_description = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_description']).toarray()
tfidf_text = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_text']).toarray()
# tfidf_name = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_name']).toarray()
# Convert TF-IDF into DataFrames and add to df_preprocessed
tfidf_desc_df = pd.DataFrame(tfidf_description, columns=[f'desc_{i}' for i in range(tfidf_description.shape[1])])
tfidf_text_df = pd.DataFrame(tfidf_text, columns=[f'text_{i}' for i in range(tfidf_text.shape[1])])
# tfidf_name_df = pd.DataFrame(tfidf_name, columns=[f'name_{i}' for i in range(tfidf_name.shape[1])])
# Merge with main dataframe
df_preprocessed = pd.concat([df_preprocessed.reset_index(drop=True), tfidf_desc_df, tfidf_text_df], axis=1)
# Drop the cleaned text features
df_preprocessed = df_preprocessed.drop(columns=['cleaned_description', 'cleaned_text'])
df_preprocessed = pd.concat([df_preprocessed, rgb_df], axis=1)
df_asso = df_preprocessed.copy()
df_cate = df_preprocessed[['tweet_location_encoded', 'user_timezone_encoded']].copy()
Dataset Overview
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 _unit_id 20050 non-null int64
1 _golden 20050 non-null bool
2 _unit_state 20050 non-null object
3 _trusted_judgments 20050 non-null int64
4 _last_judgment_at 20000 non-null object
5 gender 19953 non-null object
6 gender:confidence 20024 non-null float64
7 profile_yn 20050 non-null object
8 profile_yn:confidence 20050 non-null float64
9 created 20050 non-null object
10 description 16306 non-null object
11 fav_number 20050 non-null int64
12 gender_gold 50 non-null object
13 link_color 20050 non-null object
14 name 20050 non-null object
15 profile_yn_gold 50 non-null object
16 profileimage 20050 non-null object
17 retweet_count 20050 non-null int64
18 sidebar_color 20050 non-null object
19 text 20050 non-null object
20 tweet_coord 159 non-null object
21 tweet_count 20050 non-null int64
22 tweet_created 20050 non-null object
23 tweet_id 20050 non-null float64
24 tweet_location 12565 non-null object
25 user_timezone 12252 non-null object
dtypes: bool(1), float64(3), int64(5), object(17)
memory usage: 3.8+ MB
None
_unit_id _golden _unit_state _trusted_judgments _last_judgment_at \
0 815719226 False finalized 3 10/26/15 23:24
1 815719227 False finalized 3 10/26/15 23:30
2 815719228 False finalized 3 10/26/15 23:33
3 815719229 False finalized 3 10/26/15 23:10
4 815719230 False finalized 3 10/27/15 1:15
gender gender:confidence profile_yn profile_yn:confidence \
0 male 1.0000 yes 1.0
1 male 1.0000 yes 1.0
2 male 0.6625 yes 1.0
3 male 1.0000 yes 1.0
4 female 1.0000 yes 1.0
created ... profileimage \
0 12/5/13 1:48 ... https://pbs.twimg.com/profile_images/414342229...
1 10/1/12 13:51 ... https://pbs.twimg.com/profile_images/539604221...
2 11/28/14 11:30 ... https://pbs.twimg.com/profile_images/657330418...
3 6/11/09 22:39 ... https://pbs.twimg.com/profile_images/259703936...
4 4/16/14 13:23 ... https://pbs.twimg.com/profile_images/564094871...
retweet_count sidebar_color \
0 0 FFFFFF
1 0 C0DEED
2 1 C0DEED
3 0 C0DEED
4 0 0
text tweet_coord tweet_count \
0 Robbie E Responds To Critics After Win Against... NaN 110964
1 ÛÏIt felt like they were my friends and I was... NaN 7471
2 i absolutely adore when louis starts the songs... NaN 5617
3 Hi @JordanSpieth - Looking at the url - do you... NaN 1693
4 Watching Neighbours on Sky+ catching up with t... NaN 31462
tweet_created tweet_id tweet_location user_timezone
0 10/26/15 12:40 6.587300e+17 main; @Kan1shk3 Chennai
1 10/26/15 12:40 6.587300e+17 NaN Eastern Time (US & Canada)
2 10/26/15 12:40 6.587300e+17 clcncl Belgrade
3 10/26/15 12:40 6.587300e+17 Palo Alto, CA Pacific Time (US & Canada)
4 10/26/15 12:40 6.587300e+17 NaN NaN
[5 rows x 26 columns]
Finding columns with missing data...
Column _last_judgment_at is missing 50 values.
Proportion of missing data is 0.0024937655860349127.
Column gender is missing 97 values.
Proportion of missing data is 0.00483790523690773.
Column gender:confidence is missing 26 values.
Proportion of missing data is 0.0012967581047381546.
Column description is missing 3744 values.
Proportion of missing data is 0.18673316708229426.
Column gender_gold is missing 20000 values.
Proportion of missing data is 0.9975062344139651.
Dropping column gender_gold...
Column profile_yn_gold is missing 20000 values.
Proportion of missing data is 0.9975062344139651.
Dropping column profile_yn_gold...
Column tweet_coord is missing 19891 values.
Proportion of missing data is 0.992069825436409.
Dropping column tweet_coord...
Column tweet_location is missing 7485 values.
Proportion of missing data is 0.3733167082294264.
Column user_timezone is missing 7798 values.
Proportion of missing data is 0.388927680798005.
Dataset Overview
<class 'pandas.core.frame.DataFrame'>
Index: 19953 entries, 0 to 20049
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 _unit_id 19953 non-null int64
1 _golden 19953 non-null bool
2 _unit_state 19953 non-null object
3 _trusted_judgments 19953 non-null int64
4 _last_judgment_at 19903 non-null object
5 gender 19953 non-null object
6 gender:confidence 19953 non-null float64
7 profile_yn:confidence 19953 non-null float64
8 created 19953 non-null object
9 description 16224 non-null object
10 fav_number 19953 non-null int64
11 link_color 19953 non-null object
12 name 19953 non-null object
13 profileimage 19953 non-null object
14 retweet_count 19953 non-null int64
15 sidebar_color 19953 non-null object
16 text 19953 non-null object
17 tweet_count 19953 non-null int64
18 tweet_created 19953 non-null object
19 tweet_id 19953 non-null float64
20 tweet_location 12510 non-null object
21 user_timezone 12185 non-null object
dtypes: bool(1), float64(3), int64(5), object(13)
memory usage: 3.4+ MB
None
_unit_id _golden _unit_state _trusted_judgments _last_judgment_at \
0 815719226 False finalized 3 10/26/15 23:24
1 815719227 False finalized 3 10/26/15 23:30
2 815719228 False finalized 3 10/26/15 23:33
3 815719229 False finalized 3 10/26/15 23:10
4 815719230 False finalized 3 10/27/15 1:15
gender gender:confidence profile_yn:confidence created \
0 male 1.0000 1.0 12/5/13 1:48
1 male 1.0000 1.0 10/1/12 13:51
2 male 0.6625 1.0 11/28/14 11:30
3 male 1.0000 1.0 6/11/09 22:39
4 female 1.0000 1.0 4/16/14 13:23
description ... name \
0 i sing my own rhythm. ... sheezy0
1 I'm the author of novels filled with family dr... ... DavdBurnett
2 louis whining and squealing and all ... lwtprettylaugh
3 Mobile guy. 49ers, Shazam, Google, Kleiner Pe... ... douggarland
4 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... ... WilfordGemma
profileimage retweet_count \
0 https://pbs.twimg.com/profile_images/414342229... 0
1 https://pbs.twimg.com/profile_images/539604221... 0
2 https://pbs.twimg.com/profile_images/657330418... 1
3 https://pbs.twimg.com/profile_images/259703936... 0
4 https://pbs.twimg.com/profile_images/564094871... 0
sidebar_color text \
0 FFFFFF Robbie E Responds To Critics After Win Against...
1 C0DEED ÛÏIt felt like they were my friends and I was...
2 C0DEED i absolutely adore when louis starts the songs...
3 C0DEED Hi @JordanSpieth - Looking at the url - do you...
4 0 Watching Neighbours on Sky+ catching up with t...
tweet_count tweet_created tweet_id tweet_location \
0 110964 10/26/15 12:40 6.587300e+17 main; @Kan1shk3
1 7471 10/26/15 12:40 6.587300e+17 NaN
2 5617 10/26/15 12:40 6.587300e+17 clcncl
3 1693 10/26/15 12:40 6.587300e+17 Palo Alto, CA
4 31462 10/26/15 12:40 6.587300e+17 NaN
user_timezone
0 Chennai
1 Eastern Time (US & Canada)
2 Belgrade
3 Pacific Time (US & Canada)
4 NaN
[5 rows x 22 columns]
---- EXPLORATORY DATA ANALYSIS (EDA) ----
Number of NaN values in 'link_color': 0. Number of NaN values in 'sidebar_color': 0. Number of link color is 2986. Number of side bar color is 559. Number of NaN values in 'link_color': 0 Number of NaN values in 'sidebar_color': 0
Unique Values in 'gender' [0 1 2] <class 'pandas.core.frame.DataFrame'> Index: 18836 entries, 0 to 20049 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 18836 non-null int64 1 gender:confidence 18836 non-null float64 2 description 15522 non-null object 3 favorites_per_day 18836 non-null float64 4 link_color 18836 non-null object 5 retweets_per_day 18836 non-null float64 6 sidebar_color 18836 non-null object 7 text 18836 non-null object 8 tweets_per_day 18836 non-null float64 9 user_timezone 18836 non-null object 10 tweet_location 18836 non-null object 11 profile_created_year 18836 non-null int32 12 tweet_created_year 18836 non-null int32 13 tweet_location_encoded 18836 non-null float64 14 user_timezone_encoded 18836 non-null float64 dtypes: float64(6), int32(2), int64(1), object(6) memory usage: 2.2+ MB None
All Remaining Features
['gender', 'gender:confidence', 'description', 'favorites_per_day', 'retweets_per_day', 'text', 'tweets_per_day', 'profile_created_year', 'tweet_created_year', 'tweet_location_encoded', 'user_timezone_encoded']
Dataset Overview After PreProcessing
<class 'pandas.core.frame.DataFrame'>
Index: 18836 entries, 0 to 20049
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 18836 non-null int64
1 gender:confidence 18836 non-null float64
2 description 15522 non-null object
3 favorites_per_day 18836 non-null float64
4 retweets_per_day 18836 non-null float64
5 text 18836 non-null object
6 tweets_per_day 18836 non-null float64
7 profile_created_year 18836 non-null int32
8 tweet_created_year 18836 non-null int32
9 tweet_location_encoded 18836 non-null float64
10 user_timezone_encoded 18836 non-null float64
dtypes: float64(6), int32(2), int64(1), object(2)
memory usage: 1.6+ MB
None
---- NLP Processing ----
description \
0 i sing my own rhythm.
1 I'm the author of novels filled with family dr...
2 louis whining and squealing and all
3 Mobile guy. 49ers, Shazam, Google, Kleiner Pe...
4 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...
... ...
20045 (rp)
20046 Whatever you like, it's not a problem at all. ...
20047 #TeamBarcelona ..You look lost so you should f...
20048 Anti-statist; I homeschool my kids. Aspiring t...
20049 Teamwork makes the dream work.
text
0 Robbie E Responds To Critics After Win Against...
1 ÛÏIt felt like they were my friends and I was...
2 i absolutely adore when louis starts the songs...
3 Hi @JordanSpieth - Looking at the url - do you...
4 Watching Neighbours on Sky+ catching up with t...
... ...
20045 @lookupondeath ...Fine, and I'll drink tea too...
20046 Greg Hardy you a good player and all but don't...
20047 You can miss people and still never want to se...
20048 @bitemyapp i had noticed your tendency to pee ...
20049 I think for my APUSH creative project I'm goin...
[18836 rows x 2 columns]
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package punkt_tab to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package punkt_tab is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
description \
0 i sing my own rhythm.
1 I'm the author of novels filled with family dr...
2 louis whining and squealing and all
3 Mobile guy. 49ers, Shazam, Google, Kleiner Pe...
4 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...
cleaned_description \
0 sing rhythm
1 im author novel filled family drama romance
2 louis whining squealing
3 mobile guy er shazam google kleiner perkins ya...
4 ricky wilson best frontmankaiser chief best ba...
text \
0 Robbie E Responds To Critics After Win Against...
1 ÛÏIt felt like they were my friends and I was...
2 i absolutely adore when louis starts the songs...
3 Hi @JordanSpieth - Looking at the url - do you...
4 Watching Neighbours on Sky+ catching up with t...
cleaned_text
0 robbie e responds critic win eddie edward worl...
1 felt like friend living story httpstcoarngeyhn...
2 absolutely adore louis start song hit hard fee...
3 hi jordanspieth looking url use ifttt dont typ...
4 watching neighbour sky catching neighbs xxx xxx
Applying TF-IDF Vectorisation...
CLUSTERING¶
In [2]:
print()
print()
print('---- CLUSTERING MODELS ----')
print()
print("=" * 50)
print('EXP 1: USING ALL SELECTED FEATURES')
print("=" * 50)
sil_ex1 = []
cal_ex1 = []
# Drop the gender and categorical features before normalise
df_cat = df_cate.copy()
# Drop gender feature and categorical features
df_preprocessed = df_preprocessed.drop(columns=df_cat.columns)
df_finalised = df_preprocessed.drop(columns=['gender', 'gender:confidence'])
# Normalise every existing feature
scaler = StandardScaler()
df_finalised = pd.DataFrame(scaler.fit_transform(df_finalised), columns=df_finalised.columns)
df_finalised = pd.concat([df_finalised, df_cat, df_gender], axis=1)
# find the rows that contained NaN values and drop them
df_finalised = df_finalised.dropna()
data_exp1 = df_finalised
df_ex1 = df_finalised.drop(columns=['gender', 'gender:confidence'])
# Check the preprocessed dataset in the present
print()
print('Dataset for Exp 1')
print(df_ex1.info())
print()
# Apply UMAP for dimensionality reduction
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP()
umap_vis = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42)
umap_embedding = umap_model.fit_transform(df_ex1)
umap_plot = umap_vis.fit_transform(df_ex1)
print(umap_embedding.shape)
# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(umap_embedding)
kmeans_clustering.tune_hyperparameters()
kmeans_exp1 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_plot, 'All feature types')
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex1.append(kmeans_clustering.silhoutte())
cal_ex1.append(kmeans_clustering.calinski())
k_retriever = ClusteringDataRetriever(data_exp1, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 1')
print(df_with_labels.head())
for label in np.unique(k_labels):
print()
print(f'Records found in cluster {label} from KMeans in Exp 1')
print(k_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
# DBSCAN Clustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(umap_embedding)
dbscan_clustering.tune_hyperparameters()
dbscan_exp1 = dbscan_clustering.fit_model()
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_plot, 'All feature types')
db_labels = dbscan_clustering.output_label()
sil_ex1.append(dbscan_clustering.silhoutte())
cal_ex1.append(dbscan_clustering.calinski())
# Initialize the class to retrieve data
db_retriever = ClusteringDataRetriever(data_exp1, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 1')
print(df_with_labels.head())
for label in np.unique(db_labels):
if label != -1:
print()
print(f'Records found in cluster {label} from DBSCAN in Exp 1')
print(db_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())
print()
print("=" * 50)
print('EXP 2: USING ONLY NUMERICAL AND CATEGORICAL FEATURES')
print("=" * 50)
sil_ex2 = []
cal_ex2 = []
# Normalise every existing feature
scaler = StandardScaler()
chunk_size = 100
for i in range(0, df_num.shape[0], chunk_size):
df_num.iloc[i:i + chunk_size] = scaler.fit_transform(df_num.iloc[i:i + chunk_size])
df_no_text = pd.concat([df_num, df_cate, df_gender], axis=1)
print()
print("Data with Only Numerical and Categorical Features")
print(df_no_text.info())
print()
df_no_text = df_no_text.dropna()
df_no_text_wg = df_no_text.copy()
print('Removing NaN values...')
# Drop gender feature before clustering
data_exp2 = df_no_text.drop(columns=['gender', 'gender:confidence'])
print('Dropping gender and gender:confidence...')
# Check No. of records after drop NaN values
print()
print("Dataset for Exp 2")
print(data_exp2.info())
print()
print(data_exp2.head())
# Apply UMAP for dimensionality reduction
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42)
umap_embedding = umap_model.fit_transform(data_exp2)
print(umap_embedding.shape)
# umap_embedding = umap_embedding.astype(np.float32)
# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(data_exp2)
kmeans_clustering.tune_hyperparameters()
kmeans_exp2 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_embedding, 'Numerical and categorical features') # Visualize clusters
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex2.append(kmeans_clustering.silhoutte())
cal_ex2.append(kmeans_clustering.calinski())
k_retriever = ClusteringDataRetriever(df_no_text_wg, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 2')
print(df_with_labels.head())
for label in np.unique(k_labels):
print()
print(f'Records found in cluster {label} from KMeans in Exp 2')
print(k_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
# DBSCAN Clustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(data_exp2)
dbscan_clustering.tune_hyperparameters() # Tune DBSCAN hyperparameters
dbscan_exp2 = dbscan_clustering.fit_model() # Fit the DBSCAN model
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_embedding, 'numerical and categorical features') # Plot 3D noise points and valid clusters
db_labels = dbscan_clustering.output_label()
sil_ex2.append(dbscan_clustering.silhoutte())
cal_ex2.append(dbscan_clustering.calinski())
db_retriever = ClusteringDataRetriever(df_no_text_wg, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 2')
print(df_with_labels.head())
for label in np.unique(db_labels):
if label != -1:
print()
print(f'Records found in cluster {label} from DBSCAN in Exp 2')
print(db_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())
print()
print("=" * 50)
print('EXP 3: USING ONLY TEXT FEATURES')
print("=" * 50)
sil_ex3 = []
cal_ex3 = []
# Merge with main dataframe
df_with_text = pd.concat([tfidf_desc_df, tfidf_text_df], axis=1)
# Normalise every existing feature
scaler = StandardScaler()
chunk_size = 100
for i in range(0, df_with_text.shape[0], chunk_size):
df_with_text.iloc[i:i + chunk_size] = scaler.fit_transform(df_with_text.iloc[i:i + chunk_size])
df_with_text_wg = pd.concat([df_with_text, df_gender], axis=1)
# Drop NaN values before clustering
df_with_text_wg = df_with_text_wg.dropna()
data_exp3 = df_with_text_wg.drop(columns=['gender', 'gender:confidence'])
# Drop the gender features before clustering
print('Dataset for Exp 3')
print(data_exp3.info())
print()
print(data_exp3.head())
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP()
umap_embedding_t = umap_model.fit_transform(data_exp3)
umap_embedding = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42).fit_transform(data_exp3)
# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(umap_embedding_t)
kmeans_clustering.tune_hyperparameters()
kmeans_exp3 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_embedding, 'Text features')
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex3.append(kmeans_clustering.silhoutte())
cal_ex3.append(kmeans_clustering.calinski())
k_retriever = ClusteringDataRetriever(df_with_text_wg, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 3')
print(df_with_labels.head())
for label in np.unique(k_labels):
print()
print(f'Records found in cluster {label} from KMeans in Exp 3')
print(k_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
# DBSCANClustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(umap_embedding_t)
dbscan_clustering.tune_hyperparameters()
dbscan_exp3 = dbscan_clustering.fit_model()
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_embedding, 'Text features')
db_labels = dbscan_clustering.output_label()
sil_ex3.append(dbscan_clustering.silhoutte())
cal_ex3.append(dbscan_clustering.calinski())
db_retriever = ClusteringDataRetriever(df_with_text_wg, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 3')
print(df_with_labels.head())
for label in np.unique(db_labels):
if label != -1:
print()
print(f'Records found in cluster {label} from DBSCAN in Exp 3')
print(db_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())
print()
print('---- VISUALIZE THE METRIC EVALUATION ----')
# Metric functions
model_names = ['KMeans', 'DBSCAN']
sil_scores = [sil_ex1, sil_ex2, sil_ex3]
cal_scores = [cal_ex1, cal_ex2, cal_ex3]
plot_silhouette_bar_across_experiments(model_names, sil_scores)
visualize_ch_index_across_experiments(model_names, cal_scores)
---- CLUSTERING MODELS ---- ================================================== EXP 1: USING ALL SELECTED FEATURES ================================================== Dataset for Exp 1 <class 'pandas.core.frame.DataFrame'> Index: 17702 entries, 0 to 18835 Columns: 3013 entries, favorites_per_day to user_timezone_encoded dtypes: float64(3013) memory usage: 407.1 MB None Applying UMAP for dim reduction...
[I 2024-09-19 17:45:49,716] A new study created in memory with name: no-name-33c6aaa9-9d16-4038-a5a4-495c225843be
(17702, 2) Performing K-Means Clustering...
[I 2024-09-19 17:45:54,957] Trial 0 finished with value: 0.8162233233451843 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:45:59,799] Trial 1 finished with value: 0.4471836984157562 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:04,381] Trial 2 finished with value: 0.3998541533946991 and parameters: {'n_clusters': 9, 'init': 'random'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:09,241] Trial 3 finished with value: 0.4395274221897125 and parameters: {'n_clusters': 3, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:13,853] Trial 4 finished with value: 0.41091856360435486 and parameters: {'n_clusters': 8, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:18,491] Trial 5 finished with value: 0.4032902717590332 and parameters: {'n_clusters': 10, 'init': 'random'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:23,372] Trial 6 finished with value: 0.43896961212158203 and parameters: {'n_clusters': 3, 'init': 'random'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:28,029] Trial 7 finished with value: 0.39738479256629944 and parameters: {'n_clusters': 6, 'init': 'random'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:32,695] Trial 8 finished with value: 0.42352718114852905 and parameters: {'n_clusters': 7, 'init': 'random'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:37,844] Trial 9 finished with value: 0.8162233233451843 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:42,705] Trial 10 finished with value: 0.4405798316001892 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:47,858] Trial 11 finished with value: 0.8162233233451843 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:53,026] Trial 12 finished with value: 0.8162233233451843 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:57,961] Trial 13 finished with value: 0.4405798316001892 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:47:02,826] Trial 14 finished with value: 0.448025643825531 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
Best params: {'n_clusters': 2, 'init': 'k-means++'}
The Silhouette score is 0.8162233233451843
The Callinski index is 53467.58203125
Dataset with Labels from KMeans in Exp 1
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
Records found in cluster 0 from KMeans in Exp 1
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
... ... ... ...
18829 1.0 1.0000 0
18831 0.0 0.6466 0
18832 1.0 1.0000 0
18834 1.0 1.0000 0
18835 0.0 0.6772 0
[15998 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5318
No. of records with gender 1 in cluster 0 is 5676
No. of records with gender 2 in cluster 0 is 5004
Records found in cluster 1 from KMeans in Exp 1
[I 2024-09-19 17:47:09,518] A new study created in memory with name: no-name-de4af118-ab6a-4e35-b187-5867231e4373
gender gender:confidence Cluster_Label 7 0.0 1.0000 1 33 0.0 1.0000 1 49 2.0 1.0000 1 56 1.0 0.6684 1 58 0.0 1.0000 1 ... ... ... ... 18731 1.0 1.0000 1 18738 2.0 1.0000 1 18753 0.0 0.6678 1 18789 0.0 1.0000 1 18803 1.0 1.0000 1 [1704 rows x 3 columns] No. of records with gender 0 in cluster 1 is 525 No. of records with gender 1 in cluster 1 is 525 No. of records with gender 2 in cluster 1 is 654 Performing DBSCAN Clustering...
[I 2024-09-19 17:47:15,062] Trial 0 finished with value: 0.44018059968948364 and parameters: {'eps': 0.6825546761974374, 'min_samples': 12}. Best is trial 0 with value: 0.44018059968948364.
[I 2024-09-19 17:47:20,930] Trial 1 finished with value: 0.549675703048706 and parameters: {'eps': 1.0905561932063113, 'min_samples': 5}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:26,373] Trial 2 finished with value: 0.4102407693862915 and parameters: {'eps': 0.4933035459866175, 'min_samples': 17}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:32,394] Trial 3 finished with value: 0.5234434604644775 and parameters: {'eps': 1.3809801879490313, 'min_samples': 11}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:38,584] Trial 4 finished with value: 0.5330817103385925 and parameters: {'eps': 1.7208744440250079, 'min_samples': 5}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:43,984] Trial 5 finished with value: 0.03209728002548218 and parameters: {'eps': 0.41497364231347533, 'min_samples': 11}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:49,391] Trial 6 finished with value: 0.0401240736246109 and parameters: {'eps': 0.4164881924972801, 'min_samples': 9}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:54,869] Trial 7 finished with value: 0.463115930557251 and parameters: {'eps': 0.5467979615960828, 'min_samples': 12}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:48:00,018] Trial 8 finished with value: -0.3428648114204407 and parameters: {'eps': 0.14310863827181103, 'min_samples': 14}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:48:06,007] Trial 9 finished with value: 0.5529610514640808 and parameters: {'eps': 1.2952162564617473, 'min_samples': 20}. Best is trial 9 with value: 0.5529610514640808.
[I 2024-09-19 17:48:12,367] Trial 10 finished with value: 0.5906190872192383 and parameters: {'eps': 1.9669814522582851, 'min_samples': 19}. Best is trial 10 with value: 0.5906190872192383.
[I 2024-09-19 17:48:18,750] Trial 11 finished with value: 0.5906190872192383 and parameters: {'eps': 1.9611053257653026, 'min_samples': 20}. Best is trial 10 with value: 0.5906190872192383.
[I 2024-09-19 17:48:25,097] Trial 12 finished with value: 0.5911170840263367 and parameters: {'eps': 1.9300803447293904, 'min_samples': 20}. Best is trial 12 with value: 0.5911170840263367.
[I 2024-09-19 17:48:31,496] Trial 13 finished with value: 0.5892439484596252 and parameters: {'eps': 1.9952355283145653, 'min_samples': 17}. Best is trial 12 with value: 0.5911170840263367.
[I 2024-09-19 17:48:37,700] Trial 14 finished with value: 0.5957051515579224 and parameters: {'eps': 1.68208869635505, 'min_samples': 17}. Best is trial 14 with value: 0.5957051515579224.
Found best params: {'eps': 1.68208869635505, 'min_samples': 17}
The Silhouette score is 0.5957051515579224
The Callinski index is 3768.14453125
Dataset with Labels from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
Records found in cluster 0 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
... ... ... ...
18829 1.0 1.0000 0
18831 0.0 0.6466 0
18832 1.0 1.0000 0
18834 1.0 1.0000 0
18835 0.0 0.6772 0
[15909 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5293
No. of records with gender 1 in cluster 0 is 5650
No. of records with gender 2 in cluster 0 is 4966
Records found in cluster 1 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
7 0.0 1.0000 1
33 0.0 1.0000 1
49 2.0 1.0000 1
56 1.0 0.6684 1
58 0.0 1.0000 1
132 1.0 1.0000 1
153 2.0 1.0000 1
191 2.0 0.6804 1
192 0.0 1.0000 1
199 1.0 1.0000 1
231 1.0 1.0000 1
243 0.0 1.0000 1
250 2.0 1.0000 1
288 1.0 0.6494 1
308 1.0 0.6752 1
390 1.0 0.6786 1
460 2.0 0.6708 1
503 0.0 1.0000 1
No. of records with gender 0 in cluster 1 is 6
No. of records with gender 1 in cluster 1 is 7
No. of records with gender 2 in cluster 1 is 5
Records found in cluster 2 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
282 1.0 1.0000 2
302 1.0 1.0000 2
1402 0.0 0.3539 2
1544 0.0 1.0000 2
2154 1.0 0.6561 2
2347 2.0 0.6757 2
2929 0.0 1.0000 2
2964 1.0 1.0000 2
3229 0.0 1.0000 2
3341 1.0 1.0000 2
3770 0.0 1.0000 2
3938 2.0 0.6545 2
4650 2.0 0.3571 2
5206 1.0 1.0000 2
5367 0.0 1.0000 2
5424 0.0 1.0000 2
5629 2.0 1.0000 2
5634 2.0 0.6840 2
5640 0.0 1.0000 2
5944 1.0 1.0000 2
6093 1.0 0.6653 2
6157 2.0 0.6567 2
6174 2.0 0.6619 2
6313 1.0 1.0000 2
6409 0.0 1.0000 2
6514 1.0 1.0000 2
7625 0.0 1.0000 2
8798 1.0 1.0000 2
13356 1.0 1.0000 2
No. of records with gender 0 in cluster 2 is 10
No. of records with gender 1 in cluster 2 is 12
No. of records with gender 2 in cluster 2 is 7
Records found in cluster 3 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
426 2.0 1.0000 3
431 0.0 0.6631 3
432 0.0 1.0000 3
1992 0.0 1.0000 3
2776 0.0 1.0000 3
3755 2.0 1.0000 3
3769 2.0 0.6497 3
3784 2.0 1.0000 3
4374 2.0 1.0000 3
4418 1.0 1.0000 3
4456 1.0 1.0000 3
4653 2.0 1.0000 3
4995 2.0 1.0000 3
5008 2.0 1.0000 3
5044 2.0 1.0000 3
5196 1.0 1.0000 3
5220 2.0 0.6650 3
5352 1.0 1.0000 3
5372 2.0 1.0000 3
5533 2.0 1.0000 3
5580 0.0 1.0000 3
5596 2.0 1.0000 3
5627 2.0 0.6559 3
5662 1.0 1.0000 3
5749 2.0 1.0000 3
5919 2.0 1.0000 3
5988 2.0 1.0000 3
6208 1.0 0.6543 3
6496 2.0 0.6716 3
6669 0.0 1.0000 3
7060 1.0 0.6890 3
7261 0.0 1.0000 3
7439 0.0 1.0000 3
7683 1.0 0.6699 3
7702 2.0 0.7012 3
7771 2.0 1.0000 3
7894 0.0 1.0000 3
7898 2.0 1.0000 3
7902 0.0 1.0000 3
8120 1.0 1.0000 3
8248 1.0 1.0000 3
8295 2.0 0.6579 3
8360 2.0 0.6854 3
8408 0.0 1.0000 3
8933 1.0 1.0000 3
8984 2.0 0.6890 3
9100 0.0 1.0000 3
9341 2.0 1.0000 3
9379 0.0 1.0000 3
10138 1.0 1.0000 3
10451 0.0 0.6824 3
13349 0.0 1.0000 3
14425 0.0 0.6628 3
14668 2.0 1.0000 3
16449 1.0 1.0000 3
16881 1.0 0.6733 3
No. of records with gender 0 in cluster 3 is 16
No. of records with gender 1 in cluster 3 is 14
No. of records with gender 2 in cluster 3 is 26
Records found in cluster 4 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
502 0.0 1.0000 4
578 1.0 1.0000 4
644 0.0 1.0000 4
771 0.0 1.0000 4
963 2.0 1.0000 4
... ... ... ...
9150 1.0 1.0000 4
9165 0.0 1.0000 4
9216 2.0 0.6519 4
9221 2.0 1.0000 4
9243 0.0 0.3506 4
[175 rows x 3 columns]
No. of records with gender 0 in cluster 4 is 52
No. of records with gender 1 in cluster 4 is 52
No. of records with gender 2 in cluster 4 is 71
Records found in cluster 5 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
513 2.0 1.0000 5
514 0.0 1.0000 5
520 0.0 0.3458 5
553 0.0 1.0000 5
554 0.0 0.3431 5
555 0.0 1.0000 5
556 0.0 1.0000 5
557 0.0 1.0000 5
560 1.0 1.0000 5
564 1.0 1.0000 5
565 1.0 1.0000 5
566 2.0 0.6829 5
576 0.0 1.0000 5
577 2.0 1.0000 5
1102 1.0 0.6777 5
2660 0.0 0.3478 5
7995 2.0 1.0000 5
8037 0.0 0.6374 5
8233 0.0 1.0000 5
10824 0.0 1.0000 5
No. of records with gender 0 in cluster 5 is 12
No. of records with gender 1 in cluster 5 is 4
No. of records with gender 2 in cluster 5 is 4
Records found in cluster 6 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
570 2.0 0.6616 6
2860 0.0 1.0000 6
2862 0.0 1.0000 6
2863 0.0 0.3370 6
2866 2.0 0.6497 6
2870 2.0 0.6368 6
2872 0.0 0.6855 6
2873 1.0 0.6940 6
2996 1.0 1.0000 6
3168 1.0 1.0000 6
4767 1.0 0.6774 6
5853 2.0 0.6619 6
8255 2.0 0.6672 6
9773 0.0 0.6607 6
10211 1.0 1.0000 6
10698 1.0 0.6795 6
11317 2.0 1.0000 6
11909 1.0 1.0000 6
12736 1.0 0.6619 6
14216 1.0 1.0000 6
14307 2.0 0.6617 6
14448 0.0 1.0000 6
14613 0.0 1.0000 6
14791 1.0 1.0000 6
15015 1.0 1.0000 6
15216 0.0 1.0000 6
15333 1.0 1.0000 6
15424 0.0 0.6608 6
15800 1.0 1.0000 6
16873 1.0 1.0000 6
17596 1.0 1.0000 6
18337 1.0 1.0000 6
No. of records with gender 0 in cluster 6 is 9
No. of records with gender 1 in cluster 6 is 16
No. of records with gender 2 in cluster 6 is 7
Records found in cluster 7 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
575 0.0 1.0000 7
1308 0.0 0.6479 7
2033 1.0 1.0000 7
2308 1.0 0.6774 7
3898 0.0 1.0000 7
5454 2.0 0.6774 7
5539 1.0 1.0000 7
5628 2.0 1.0000 7
5825 1.0 1.0000 7
5847 2.0 0.6717 7
6012 0.0 1.0000 7
6048 2.0 0.6796 7
6108 0.0 1.0000 7
6114 1.0 0.6620 7
6335 2.0 1.0000 7
6382 2.0 0.6842 7
6417 2.0 1.0000 7
7843 2.0 1.0000 7
8181 0.0 1.0000 7
8355 2.0 0.6778 7
8738 0.0 1.0000 7
No. of records with gender 0 in cluster 7 is 7
No. of records with gender 1 in cluster 7 is 5
No. of records with gender 2 in cluster 7 is 9
Records found in cluster 8 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
599 1.0 1.0000 8
1268 2.0 1.0000 8
2682 1.0 0.6473 8
3360 1.0 1.0000 8
5548 2.0 1.0000 8
6616 1.0 1.0000 8
7610 2.0 0.6578 8
8509 2.0 0.6731 8
9305 2.0 0.6606 8
9515 0.0 0.6648 8
10396 1.0 1.0000 8
10608 1.0 1.0000 8
10796 0.0 0.6912 8
10981 0.0 1.0000 8
11477 2.0 1.0000 8
11770 2.0 1.0000 8
12451 2.0 1.0000 8
12803 1.0 0.6667 8
12996 1.0 1.0000 8
13263 2.0 0.6743 8
13436 0.0 1.0000 8
14141 0.0 1.0000 8
14290 0.0 1.0000 8
14473 0.0 1.0000 8
14878 2.0 0.6502 8
15088 0.0 0.6581 8
15727 2.0 1.0000 8
16605 0.0 0.6578 8
16973 0.0 1.0000 8
17197 1.0 1.0000 8
17330 0.0 1.0000 8
17728 1.0 0.6702 8
18071 2.0 1.0000 8
18531 2.0 1.0000 8
No. of records with gender 0 in cluster 8 is 11
No. of records with gender 1 in cluster 8 is 10
No. of records with gender 2 in cluster 8 is 13
Records found in cluster 9 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
725 0.0 1.0000 9
1203 1.0 1.0000 9
1240 1.0 0.6889 9
2115 0.0 1.0000 9
2381 0.0 1.0000 9
3988 2.0 1.0000 9
5994 2.0 0.6611 9
7988 1.0 0.6734 9
8071 1.0 1.0000 9
10735 0.0 1.0000 9
10738 0.0 1.0000 9
11076 2.0 1.0000 9
11179 2.0 1.0000 9
11484 1.0 1.0000 9
11648 1.0 1.0000 9
11746 0.0 1.0000 9
12054 1.0 1.0000 9
13078 0.0 1.0000 9
14056 2.0 1.0000 9
15064 0.0 0.6534 9
15751 1.0 1.0000 9
15757 1.0 1.0000 9
16465 0.0 1.0000 9
16868 1.0 1.0000 9
17448 0.0 1.0000 9
18208 0.0 1.0000 9
18753 0.0 0.6678 9
No. of records with gender 0 in cluster 9 is 12
No. of records with gender 1 in cluster 9 is 10
No. of records with gender 2 in cluster 9 is 5
Records found in cluster 10 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
822 0.0 0.6473 10
1536 2.0 0.6591 10
2971 1.0 1.0000 10
10714 0.0 1.0000 10
11119 1.0 1.0000 10
11627 2.0 0.6796 10
11727 2.0 1.0000 10
12324 1.0 1.0000 10
12333 1.0 1.0000 10
12992 0.0 1.0000 10
13486 2.0 1.0000 10
13980 0.0 1.0000 10
14046 0.0 1.0000 10
14170 1.0 1.0000 10
14958 2.0 1.0000 10
15223 0.0 1.0000 10
15597 1.0 0.3362 10
15889 2.0 0.3383 10
16706 0.0 1.0000 10
16735 0.0 0.6563 10
17090 0.0 1.0000 10
17186 1.0 1.0000 10
17599 0.0 0.6654 10
18270 0.0 1.0000 10
No. of records with gender 0 in cluster 10 is 11
No. of records with gender 1 in cluster 10 is 7
No. of records with gender 2 in cluster 10 is 6
Records found in cluster 11 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
1040 1.0 1.0000 11
1045 2.0 0.6789 11
1049 1.0 1.0000 11
1051 2.0 1.0000 11
1052 1.0 1.0000 11
1054 1.0 1.0000 11
1061 0.0 1.0000 11
1064 1.0 0.6498 11
1065 0.0 1.0000 11
3581 0.0 1.0000 11
3705 2.0 0.6581 11
3809 2.0 1.0000 11
3906 1.0 0.6422 11
4041 0.0 1.0000 11
4156 1.0 1.0000 11
4272 2.0 1.0000 11
4341 0.0 1.0000 11
4410 2.0 1.0000 11
4508 1.0 1.0000 11
4631 2.0 1.0000 11
4736 2.0 1.0000 11
4840 2.0 1.0000 11
5305 1.0 1.0000 11
No. of records with gender 0 in cluster 11 is 5
No. of records with gender 1 in cluster 11 is 9
No. of records with gender 2 in cluster 11 is 9
Records found in cluster 12 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
1108 1.0 0.6880 12
9382 2.0 1.0000 12
9398 1.0 1.0000 12
9475 0.0 1.0000 12
9496 0.0 1.0000 12
... ... ... ...
15207 1.0 1.0000 12
15391 2.0 1.0000 12
15439 2.0 1.0000 12
15622 2.0 1.0000 12
18398 0.0 0.6709 12
[70 rows x 3 columns]
No. of records with gender 0 in cluster 12 is 19
No. of records with gender 1 in cluster 12 is 25
No. of records with gender 2 in cluster 12 is 26
Records found in cluster 13 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
1273 0.0 1.0000 13
1605 2.0 1.0000 13
1761 2.0 1.0000 13
1845 1.0 1.0000 13
1987 1.0 1.0000 13
2274 0.0 1.0000 13
3723 1.0 1.0000 13
3961 0.0 1.0000 13
4092 0.0 0.3411 13
4424 2.0 1.0000 13
4898 0.0 1.0000 13
5218 2.0 1.0000 13
5276 2.0 0.6632 13
5336 1.0 1.0000 13
5379 0.0 1.0000 13
5445 0.0 1.0000 13
5536 2.0 0.6943 13
5927 2.0 0.6721 13
5949 1.0 0.6848 13
5980 0.0 1.0000 13
6017 1.0 0.3486 13
6245 2.0 1.0000 13
6262 2.0 1.0000 13
6289 1.0 1.0000 13
6298 0.0 1.0000 13
6466 2.0 1.0000 13
7003 1.0 1.0000 13
7118 2.0 1.0000 13
7431 1.0 1.0000 13
7540 0.0 0.6859 13
7791 1.0 1.0000 13
8142 2.0 1.0000 13
8601 2.0 0.6700 13
8693 0.0 1.0000 13
9023 1.0 0.6654 13
9265 1.0 1.0000 13
15378 1.0 1.0000 13
No. of records with gender 0 in cluster 13 is 11
No. of records with gender 1 in cluster 13 is 13
No. of records with gender 2 in cluster 13 is 13
Records found in cluster 14 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
2138 1.0 1.0000 14
2145 0.0 1.0000 14
2146 1.0 1.0000 14
2147 1.0 1.0000 14
2148 1.0 0.3576 14
2156 0.0 1.0000 14
2166 1.0 1.0000 14
2168 0.0 0.6825 14
2169 1.0 1.0000 14
2171 1.0 1.0000 14
2172 0.0 1.0000 14
2182 2.0 1.0000 14
2185 0.0 1.0000 14
2186 0.0 0.3403 14
2187 1.0 1.0000 14
2188 2.0 0.6812 14
2189 0.0 0.6582 14
2191 0.0 1.0000 14
2194 1.0 1.0000 14
2196 1.0 1.0000 14
2204 1.0 0.6587 14
2205 0.0 0.6685 14
2206 1.0 0.6551 14
2207 1.0 1.0000 14
2210 1.0 1.0000 14
2216 1.0 0.6896 14
2217 1.0 0.6832 14
2220 1.0 1.0000 14
2223 2.0 1.0000 14
No. of records with gender 0 in cluster 14 is 9
No. of records with gender 1 in cluster 14 is 17
No. of records with gender 2 in cluster 14 is 3
Records found in cluster 15 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
2445 1.0 1.0000 15
4210 2.0 1.0000 15
11871 1.0 1.0000 15
14380 0.0 0.3398 15
14935 2.0 0.6634 15
14972 1.0 0.6475 15
15079 2.0 1.0000 15
15173 0.0 1.0000 15
15186 0.0 1.0000 15
15228 1.0 1.0000 15
15231 0.0 1.0000 15
15234 0.0 1.0000 15
15236 1.0 1.0000 15
15278 2.0 1.0000 15
15287 1.0 0.6880 15
15288 2.0 1.0000 15
15292 2.0 1.0000 15
15295 2.0 1.0000 15
15313 2.0 1.0000 15
15316 2.0 1.0000 15
15322 0.0 1.0000 15
15324 2.0 0.6344 15
15338 1.0 0.6791 15
No. of records with gender 0 in cluster 15 is 6
No. of records with gender 1 in cluster 15 is 7
No. of records with gender 2 in cluster 15 is 10
Records found in cluster 16 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
2475 0.0 1.0 16
4315 1.0 1.0 16
5147 1.0 1.0 16
9579 2.0 1.0 16
17729 0.0 1.0 16
... ... ... ...
18371 2.0 1.0 16
18372 2.0 1.0 16
18373 0.0 1.0 16
18374 1.0 1.0 16
18375 0.0 1.0 16
[156 rows x 3 columns]
No. of records with gender 0 in cluster 16 is 67
No. of records with gender 1 in cluster 16 is 60
No. of records with gender 2 in cluster 16 is 29
Records found in cluster 17 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
3385 1.0 1.0000 17
3386 1.0 0.6628 17
3388 2.0 1.0000 17
3391 0.0 0.6612 17
3393 1.0 1.0000 17
3394 1.0 1.0000 17
3396 1.0 1.0000 17
3397 0.0 1.0000 17
3398 2.0 1.0000 17
3400 1.0 0.6727 17
3401 2.0 1.0000 17
3402 0.0 1.0000 17
3406 0.0 0.6819 17
3407 1.0 1.0000 17
3411 0.0 1.0000 17
3412 1.0 1.0000 17
3413 1.0 0.7023 17
No. of records with gender 0 in cluster 17 is 5
No. of records with gender 1 in cluster 17 is 9
No. of records with gender 2 in cluster 17 is 3
Records found in cluster 18 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
3744 0.0 0.6440 18
3927 0.0 1.0000 18
3994 1.0 1.0000 18
4057 2.0 0.3516 18
4300 2.0 0.6736 18
4398 1.0 1.0000 18
4470 2.0 0.6602 18
4544 0.0 1.0000 18
4640 2.0 1.0000 18
4800 2.0 0.6575 18
4883 2.0 1.0000 18
5043 1.0 1.0000 18
5238 1.0 1.0000 18
5325 1.0 0.6645 18
5515 2.0 1.0000 18
5659 1.0 1.0000 18
5978 2.0 1.0000 18
6188 2.0 0.6748 18
6440 2.0 1.0000 18
6562 0.0 1.0000 18
6671 2.0 1.0000 18
6749 1.0 1.0000 18
6826 2.0 0.6933 18
7050 0.0 0.6736 18
No. of records with gender 0 in cluster 18 is 5
No. of records with gender 1 in cluster 18 is 7
No. of records with gender 2 in cluster 18 is 12
Records found in cluster 19 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
3878 0.0 0.6691 19
4606 0.0 1.0000 19
4627 2.0 1.0000 19
4690 0.0 0.6763 19
4712 0.0 1.0000 19
... ... ... ...
9294 0.0 1.0000 19
9313 2.0 0.6841 19
11175 0.0 1.0000 19
13999 0.0 0.6649 19
18789 0.0 1.0000 19
[275 rows x 3 columns]
No. of records with gender 0 in cluster 19 is 81
No. of records with gender 1 in cluster 19 is 63
No. of records with gender 2 in cluster 19 is 131
Records found in cluster 20 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4012 1.0 1.0000 20
4097 0.0 0.6706 20
4100 2.0 1.0000 20
4177 0.0 0.6729 20
4219 0.0 1.0000 20
... ... ... ...
5777 2.0 0.6638 20
5809 0.0 1.0000 20
5849 0.0 0.6792 20
5881 2.0 1.0000 20
5910 0.0 0.6787 20
[112 rows x 3 columns]
No. of records with gender 0 in cluster 20 is 40
No. of records with gender 1 in cluster 20 is 27
No. of records with gender 2 in cluster 20 is 45
Records found in cluster 21 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4146 0.0 1.0000 21
5546 1.0 1.0000 21
5644 1.0 0.6725 21
6374 2.0 1.0000 21
6391 1.0 1.0000 21
6688 2.0 1.0000 21
6772 2.0 1.0000 21
6814 1.0 1.0000 21
6882 0.0 0.6879 21
6904 2.0 0.6842 21
7745 1.0 1.0000 21
8159 2.0 1.0000 21
8331 2.0 0.6716 21
8340 2.0 0.6707 21
8487 0.0 0.6806 21
8505 1.0 1.0000 21
8622 0.0 0.6634 21
8690 2.0 1.0000 21
8764 2.0 0.6674 21
8784 2.0 1.0000 21
8834 2.0 1.0000 21
8859 2.0 1.0000 21
8971 1.0 1.0000 21
9055 1.0 1.0000 21
No. of records with gender 0 in cluster 21 is 4
No. of records with gender 1 in cluster 21 is 8
No. of records with gender 2 in cluster 21 is 12
Records found in cluster 22 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4224 0.0 1.0000 22
4319 0.0 1.0000 22
4392 0.0 0.6567 22
4506 0.0 1.0000 22
4558 2.0 0.6866 22
... ... ... ...
9151 1.0 0.6453 22
9194 2.0 1.0000 22
9195 1.0 1.0000 22
9220 2.0 1.0000 22
9283 2.0 0.6659 22
[97 rows x 3 columns]
No. of records with gender 0 in cluster 22 is 23
No. of records with gender 1 in cluster 22 is 28
No. of records with gender 2 in cluster 22 is 46
Records found in cluster 23 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4510 2.0 1.0000 23
4657 2.0 0.6751 23
4674 2.0 1.0000 23
4826 1.0 0.6887 23
5007 0.0 1.0000 23
5094 1.0 1.0000 23
5192 2.0 0.6835 23
5471 0.0 1.0000 23
5561 0.0 1.0000 23
5572 1.0 1.0000 23
5598 0.0 1.0000 23
5807 1.0 1.0000 23
5877 1.0 1.0000 23
6063 2.0 0.6930 23
6082 2.0 1.0000 23
6476 0.0 1.0000 23
6505 2.0 1.0000 23
6599 2.0 1.0000 23
6884 2.0 1.0000 23
6983 2.0 1.0000 23
7497 0.0 0.6799 23
7508 0.0 1.0000 23
7509 1.0 1.0000 23
7593 2.0 1.0000 23
7596 0.0 1.0000 23
7652 0.0 0.6772 23
7760 2.0 1.0000 23
7966 0.0 0.6607 23
8050 2.0 1.0000 23
8203 2.0 1.0000 23
8269 0.0 0.6774 23
8313 0.0 1.0000 23
8353 2.0 0.6650 23
8412 1.0 0.6900 23
8478 0.0 1.0000 23
8525 0.0 1.0000 23
8528 1.0 1.0000 23
8531 2.0 0.6681 23
8586 0.0 0.6453 23
8645 2.0 0.6778 23
8699 0.0 1.0000 23
8711 2.0 1.0000 23
8739 0.0 1.0000 23
8849 0.0 0.6906 23
8865 1.0 1.0000 23
8886 2.0 0.3536 23
8923 2.0 1.0000 23
8948 2.0 1.0000 23
8997 0.0 1.0000 23
9056 2.0 1.0000 23
9125 2.0 1.0000 23
9190 1.0 1.0000 23
9293 0.0 1.0000 23
No. of records with gender 0 in cluster 23 is 20
No. of records with gender 1 in cluster 23 is 10
No. of records with gender 2 in cluster 23 is 23
Records found in cluster 24 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4572 1.0 1.0000 24
4746 1.0 1.0000 24
4970 0.0 1.0000 24
4997 2.0 0.6957 24
5002 1.0 1.0000 24
5069 2.0 0.6832 24
5153 2.0 0.6735 24
5156 2.0 0.6516 24
5200 0.0 1.0000 24
5227 1.0 1.0000 24
5265 1.0 1.0000 24
5319 1.0 1.0000 24
5328 0.0 1.0000 24
5348 0.0 1.0000 24
5351 0.0 1.0000 24
5401 2.0 0.6836 24
5470 2.0 1.0000 24
5511 1.0 1.0000 24
5616 2.0 1.0000 24
5625 0.0 1.0000 24
5632 2.0 0.6651 24
5674 1.0 1.0000 24
5712 1.0 1.0000 24
5793 2.0 0.6675 24
5846 2.0 1.0000 24
5883 2.0 0.6725 24
5904 0.0 1.0000 24
5954 0.0 1.0000 24
5973 2.0 0.6509 24
6071 2.0 0.6524 24
6102 0.0 0.6699 24
6228 0.0 0.6636 24
6293 0.0 1.0000 24
6309 1.0 0.3750 24
6327 2.0 0.6733 24
6400 2.0 1.0000 24
6403 2.0 0.6663 24
6577 2.0 1.0000 24
6579 2.0 0.6762 24
6633 0.0 1.0000 24
6670 0.0 1.0000 24
6758 0.0 0.3469 24
7258 1.0 0.6902 24
7532 2.0 1.0000 24
7681 1.0 1.0000 24
7703 1.0 1.0000 24
7882 0.0 1.0000 24
18803 1.0 1.0000 24
No. of records with gender 0 in cluster 24 is 15
No. of records with gender 1 in cluster 24 is 14
No. of records with gender 2 in cluster 24 is 19
Records found in cluster 25 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4595 1.0 1.0000 25
4621 1.0 1.0000 25
4685 2.0 1.0000 25
4780 2.0 1.0000 25
4866 2.0 1.0000 25
... ... ... ...
12284 1.0 1.0000 25
12397 0.0 1.0000 25
12507 2.0 1.0000 25
12659 2.0 1.0000 25
12754 2.0 0.6615 25
[134 rows x 3 columns]
No. of records with gender 0 in cluster 25 is 30
No. of records with gender 1 in cluster 25 is 38
No. of records with gender 2 in cluster 25 is 66
Records found in cluster 26 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
6903 2.0 1.0000 26
7336 1.0 0.6624 26
7531 2.0 1.0000 26
7620 1.0 0.6549 26
8113 2.0 0.6675 26
8116 2.0 0.6611 26
8178 2.0 1.0000 26
8204 2.0 0.6746 26
8272 2.0 1.0000 26
8338 0.0 1.0000 26
8356 1.0 0.6517 26
8402 2.0 0.6767 26
8520 2.0 0.6820 26
8546 1.0 1.0000 26
8580 2.0 1.0000 26
8679 2.0 1.0000 26
8688 2.0 0.3354 26
8732 2.0 0.6946 26
8783 2.0 1.0000 26
8854 0.0 1.0000 26
8940 2.0 0.6815 26
8954 2.0 1.0000 26
8965 2.0 1.0000 26
9123 2.0 1.0000 26
9130 2.0 0.6741 26
9207 2.0 1.0000 26
9212 0.0 1.0000 26
9217 2.0 0.3376 26
9228 0.0 1.0000 26
9323 1.0 1.0000 26
No. of records with gender 0 in cluster 26 is 4
No. of records with gender 1 in cluster 26 is 5
No. of records with gender 2 in cluster 26 is 21
Records found in cluster 27 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
7289 0.0 1.0000 27
12796 1.0 1.0000 27
13303 1.0 1.0000 27
13417 1.0 1.0000 27
13502 1.0 1.0000 27
13716 1.0 0.6830 27
13901 2.0 0.6611 27
14140 0.0 0.6645 27
14214 2.0 1.0000 27
14269 2.0 0.6868 27
14337 1.0 1.0000 27
14412 1.0 1.0000 27
14483 0.0 1.0000 27
14645 1.0 1.0000 27
15443 2.0 1.0000 27
15534 0.0 1.0000 27
15807 0.0 1.0000 27
15916 1.0 1.0000 27
16188 1.0 1.0000 27
16418 2.0 1.0000 27
16672 1.0 1.0000 27
16725 1.0 1.0000 27
17269 0.0 1.0000 27
17351 1.0 0.6556 27
17442 1.0 1.0000 27
17842 0.0 1.0000 27
18412 2.0 0.6690 27
18510 1.0 1.0000 27
18731 1.0 1.0000 27
18738 2.0 1.0000 27
No. of records with gender 0 in cluster 27 is 7
No. of records with gender 1 in cluster 27 is 16
No. of records with gender 2 in cluster 27 is 7
Records found in cluster 28 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
7381 2.0 1.0000 28
7470 1.0 0.6810 28
7542 0.0 1.0000 28
7616 2.0 0.6675 28
7675 2.0 1.0000 28
7744 2.0 0.6761 28
7795 1.0 0.6602 28
7871 2.0 1.0000 28
7946 1.0 1.0000 28
8010 1.0 1.0000 28
8069 1.0 1.0000 28
8125 1.0 1.0000 28
8180 1.0 0.6850 28
8253 2.0 1.0000 28
8395 1.0 1.0000 28
8477 1.0 1.0000 28
8532 1.0 1.0000 28
8587 2.0 1.0000 28
8657 1.0 1.0000 28
8755 0.0 0.6707 28
8810 0.0 1.0000 28
8906 1.0 0.7047 28
8977 1.0 1.0000 28
9039 1.0 1.0000 28
9101 0.0 0.3496 28
9172 0.0 1.0000 28
9247 2.0 0.6622 28
9317 0.0 1.0000 28
17122 2.0 0.6583 28
No. of records with gender 0 in cluster 28 is 6
No. of records with gender 1 in cluster 28 is 14
No. of records with gender 2 in cluster 28 is 9
Records found in cluster 29 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
7434 2.0 1.0000 29
7662 0.0 1.0000 29
7811 2.0 0.6341 29
7910 2.0 1.0000 29
8401 0.0 0.6732 29
8489 0.0 1.0000 29
8535 2.0 1.0000 29
8583 0.0 1.0000 29
8623 2.0 0.6778 29
8647 2.0 1.0000 29
8925 0.0 1.0000 29
8930 2.0 1.0000 29
9001 1.0 1.0000 29
9076 2.0 1.0000 29
9089 1.0 1.0000 29
9118 2.0 0.6712 29
9166 2.0 1.0000 29
9280 1.0 1.0000 29
No. of records with gender 0 in cluster 29 is 5
No. of records with gender 1 in cluster 29 is 3
No. of records with gender 2 in cluster 29 is 10
Records classified as noise
gender gender:confidence Cluster_Label
941 2.0 0.6582 -1
1367 1.0 1.0000 -1
2135 2.0 1.0000 -1
2382 1.0 1.0000 -1
2897 2.0 1.0000 -1
... ... ... ...
18272 0.0 0.6686 -1
18399 0.0 1.0000 -1
18527 1.0 1.0000 -1
18646 0.0 1.0000 -1
18759 0.0 0.6386 -1
[128 rows x 3 columns]
==================================================
EXP 2: USING ONLY NUMERICAL AND CATEGORICAL FEATURES
==================================================
Data with Only Numerical and Categorical Features
<class 'pandas.core.frame.DataFrame'>
Index: 19970 entries, 0 to 18833
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 retweets_per_day 18836 non-null float64
1 favorites_per_day 18836 non-null float64
2 tweets_per_day 18836 non-null float64
3 profile_created_year 18836 non-null float64
4 tweet_created_year 18836 non-null float64
5 tweet_location_encoded 18836 non-null float64
6 user_timezone_encoded 18836 non-null float64
7 gender 18836 non-null float64
8 gender:confidence 18836 non-null float64
dtypes: float64(9)
memory usage: 1.5 MB
None
Removing NaN values...
Dropping gender and gender:confidence...
Dataset for Exp 2
<class 'pandas.core.frame.DataFrame'>
Index: 17702 entries, 0 to 18835
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 retweets_per_day 17702 non-null float64
1 favorites_per_day 17702 non-null float64
2 tweets_per_day 17702 non-null float64
3 profile_created_year 17702 non-null float64
4 tweet_created_year 17702 non-null float64
5 tweet_location_encoded 17702 non-null float64
6 user_timezone_encoded 17702 non-null float64
dtypes: float64(7)
memory usage: 1.1 MB
None
retweets_per_day favorites_per_day tweets_per_day profile_created_year \
0 -0.100504 -0.318862 1.467473 0.497680
1 -0.100504 -0.313380 -0.582881 0.028171
2 9.949874 0.438028 -0.593854 0.967189
3 -0.100504 -0.306100 -0.691861 -1.380358
4 -0.100504 3.133544 -0.075028 0.967189
tweet_created_year tweet_location_encoded user_timezone_encoded
0 0.0 0.000053 0.001699
1 0.0 0.363294 0.127309
2 0.0 0.000053 0.002071
3 0.0 0.000159 0.105755
4 0.0 0.363294 0.381344
Applying UMAP for dim reduction...
[I 2024-09-19 17:49:07,472] A new study created in memory with name: no-name-ededf1db-d8b9-424d-a4b0-6fb01985e602
(17702, 3) Performing K-Means Clustering...
[I 2024-09-19 17:49:11,001] Trial 0 finished with value: 0.3505964259796568 and parameters: {'n_clusters': 8, 'init': 'k-means++'}. Best is trial 0 with value: 0.3505964259796568.
[I 2024-09-19 17:49:14,567] Trial 1 finished with value: 0.35147759105495685 and parameters: {'n_clusters': 9, 'init': 'k-means++'}. Best is trial 1 with value: 0.35147759105495685.
[I 2024-09-19 17:49:18,508] Trial 2 finished with value: 0.37002444364194625 and parameters: {'n_clusters': 3, 'init': 'k-means++'}. Best is trial 2 with value: 0.37002444364194625.
[I 2024-09-19 17:49:22,259] Trial 3 finished with value: 0.35071003394577077 and parameters: {'n_clusters': 8, 'init': 'random'}. Best is trial 2 with value: 0.37002444364194625.
[I 2024-09-19 17:49:26,124] Trial 4 finished with value: 0.35071003394577077 and parameters: {'n_clusters': 8, 'init': 'random'}. Best is trial 2 with value: 0.37002444364194625.
[I 2024-09-19 17:49:29,770] Trial 5 finished with value: 0.27190640524695253 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 2 with value: 0.37002444364194625.
[I 2024-09-19 17:49:33,550] Trial 6 finished with value: 0.35279956412054736 and parameters: {'n_clusters': 9, 'init': 'random'}. Best is trial 2 with value: 0.37002444364194625.
[I 2024-09-19 17:49:37,429] Trial 7 finished with value: 0.4024977317416951 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 7 with value: 0.4024977317416951.
[I 2024-09-19 17:49:41,117] Trial 8 finished with value: 0.4278633474913973 and parameters: {'n_clusters': 5, 'init': 'random'}. Best is trial 8 with value: 0.4278633474913973.
[I 2024-09-19 17:49:44,821] Trial 9 finished with value: 0.4280682750014589 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
[I 2024-09-19 17:49:48,424] Trial 10 finished with value: 0.4280682750014589 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
[I 2024-09-19 17:49:52,022] Trial 11 finished with value: 0.4280682750014589 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
[I 2024-09-19 17:49:55,618] Trial 12 finished with value: 0.4280682750014589 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
[I 2024-09-19 17:49:59,260] Trial 13 finished with value: 0.33270855690372014 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
[I 2024-09-19 17:50:02,878] Trial 14 finished with value: 0.4274023684329269 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
Best params: {'n_clusters': 6, 'init': 'k-means++'}
[I 2024-09-19 17:50:07,251] A new study created in memory with name: no-name-195f08bf-a772-410f-b5da-70f8ab50a69d
The Silhouette score is 0.4280682750014589
The Callinski index is 8777.448715016866
Dataset with Labels from KMeans in Exp 2
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 4
3 0.0 1.0000 1
4 1.0 1.0000 2
Records found in cluster 0 from KMeans in Exp 2
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
7 0.0 1.0000 0
8 1.0 1.0000 0
11 2.0 1.0000 0
... ... ... ...
18828 1.0 0.3460 0
18829 1.0 1.0000 0
18831 0.0 0.6466 0
18832 1.0 1.0000 0
18835 0.0 0.6772 0
[9067 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 2483
No. of records with gender 1 in cluster 0 is 3062
No. of records with gender 2 in cluster 0 is 3522
Records found in cluster 1 from KMeans in Exp 2
gender gender:confidence Cluster_Label
3 0.0 1.0000 1
5 1.0 1.0000 1
6 2.0 1.0000 1
9 1.0 1.0000 1
10 2.0 0.7002 1
... ... ... ...
18811 2.0 1.0000 1
18813 0.0 1.0000 1
18814 0.0 1.0000 1
18817 2.0 0.6579 1
18821 1.0 1.0000 1
[6958 rows x 3 columns]
No. of records with gender 0 in cluster 1 is 2818
No. of records with gender 1 in cluster 1 is 2591
No. of records with gender 2 in cluster 1 is 1549
Records found in cluster 2 from KMeans in Exp 2
gender gender:confidence Cluster_Label
4 1.0 1.0000 2
62 1.0 1.0000 2
87 1.0 1.0000 2
101 0.0 1.0000 2
106 1.0 1.0000 2
... ... ... ...
18683 1.0 1.0000 2
18696 1.0 0.6644 2
18788 2.0 0.3429 2
18807 0.0 1.0000 2
18834 1.0 1.0000 2
[712 rows x 3 columns]
No. of records with gender 0 in cluster 2 is 263
No. of records with gender 1 in cluster 2 is 333
No. of records with gender 2 in cluster 2 is 116
Records found in cluster 3 from KMeans in Exp 2
gender gender:confidence Cluster_Label
13 2.0 1.0000 3
34 2.0 1.0000 3
59 2.0 0.6694 3
65 0.0 0.6539 3
69 1.0 0.6738 3
... ... ... ...
18659 1.0 1.0000 3
18661 2.0 1.0000 3
18680 0.0 1.0000 3
18693 1.0 0.6553 3
18763 2.0 1.0000 3
[557 rows x 3 columns]
No. of records with gender 0 in cluster 3 is 128
No. of records with gender 1 in cluster 3 is 102
No. of records with gender 2 in cluster 3 is 327
Records found in cluster 4 from KMeans in Exp 2
gender gender:confidence Cluster_Label
2 0.0 0.6625 4
286 2.0 1.0000 4
392 2.0 0.6576 4
429 1.0 1.0000 4
633 2.0 1.0000 4
... ... ... ...
18071 2.0 1.0000 4
18108 0.0 1.0000 4
18502 2.0 1.0000 4
18516 2.0 1.0000 4
18649 0.0 1.0000 4
[162 rows x 3 columns]
No. of records with gender 0 in cluster 4 is 57
No. of records with gender 1 in cluster 4 is 42
No. of records with gender 2 in cluster 4 is 63
Records found in cluster 5 from KMeans in Exp 2
gender gender:confidence Cluster_Label
257 1.0 1.0000 5
306 1.0 1.0000 5
308 1.0 0.6752 5
1540 0.0 1.0000 5
1622 1.0 1.0000 5
... ... ... ...
18407 2.0 1.0000 5
18720 0.0 1.0000 5
18765 1.0 1.0000 5
18784 2.0 1.0000 5
18796 0.0 0.6760 5
[246 rows x 3 columns]
No. of records with gender 0 in cluster 5 is 94
No. of records with gender 1 in cluster 5 is 71
No. of records with gender 2 in cluster 5 is 81
Performing DBSCAN Clustering...
[I 2024-09-19 17:50:17,407] Trial 0 finished with value: 0.7435771107611636 and parameters: {'eps': 1.6875605206990094, 'min_samples': 9}. Best is trial 0 with value: 0.7435771107611636.
[I 2024-09-19 17:50:27,746] Trial 1 finished with value: 0.755250571224431 and parameters: {'eps': 1.9630802278917843, 'min_samples': 11}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:50:34,083] Trial 2 finished with value: 0.3476584528395313 and parameters: {'eps': 0.3823838274275766, 'min_samples': 7}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:50:42,247] Trial 3 finished with value: 0.517654092876453 and parameters: {'eps': 0.7134123414087624, 'min_samples': 12}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:50:51,986] Trial 4 finished with value: 0.7205824902178928 and parameters: {'eps': 1.4568916045764555, 'min_samples': 19}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:50:56,780] Trial 5 finished with value: -0.18883320225862038 and parameters: {'eps': 0.19229954783245615, 'min_samples': 19}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:06,671] Trial 6 finished with value: 0.7543665225775109 and parameters: {'eps': 1.761574322432801, 'min_samples': 7}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:12,117] Trial 7 finished with value: -0.04795422555050775 and parameters: {'eps': 0.2792160603274616, 'min_samples': 17}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:21,561] Trial 8 finished with value: 0.5299892203558405 and parameters: {'eps': 1.3258984479955693, 'min_samples': 5}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:25,994] Trial 9 finished with value: -0.4879587554963607 and parameters: {'eps': 0.11341970084515758, 'min_samples': 12}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:36,042] Trial 10 finished with value: 0.7543453657820123 and parameters: {'eps': 1.9639453920910408, 'min_samples': 14}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:46,203] Trial 11 finished with value: 0.7632826665730776 and parameters: {'eps': 1.9652082616592303, 'min_samples': 4}. Best is trial 11 with value: 0.7632826665730776.
[I 2024-09-19 17:51:56,339] Trial 12 finished with value: 0.7695151102549678 and parameters: {'eps': 1.9976508214248325, 'min_samples': 3}. Best is trial 12 with value: 0.7695151102549678.
[I 2024-09-19 17:52:05,529] Trial 13 finished with value: 0.5035289768021715 and parameters: {'eps': 1.0518221952011264, 'min_samples': 3}. Best is trial 12 with value: 0.7695151102549678.
[I 2024-09-19 17:52:15,252] Trial 14 finished with value: 0.7060846213740725 and parameters: {'eps': 1.4972565559855149, 'min_samples': 3}. Best is trial 12 with value: 0.7695151102549678.
Found best params: {'eps': 1.9976508214248325, 'min_samples': 3}
The Silhouette score is 0.7695151102549678
The Callinski index is 182.57715113799554
Dataset with Labels from DBSCAN in Exp 2
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
Records found in cluster 0 from DBSCAN in Exp 2
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
... ... ... ...
18829 1.0 1.0000 0
18831 0.0 0.6466 0
18832 1.0 1.0000 0
18834 1.0 1.0000 0
18835 0.0 0.6772 0
[17690 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5836
No. of records with gender 1 in cluster 0 is 6198
No. of records with gender 2 in cluster 0 is 5656
Records classified as noise
gender gender:confidence Cluster_Label
2502 0.0 0.6785 -1
3301 0.0 1.0000 -1
5613 1.0 1.0000 -1
6722 1.0 1.0000 -1
7666 2.0 1.0000 -1
10926 0.0 0.6513 -1
12504 0.0 1.0000 -1
12668 0.0 1.0000 -1
13331 1.0 1.0000 -1
15940 0.0 1.0000 -1
17960 0.0 1.0000 -1
18763 2.0 1.0000 -1
==================================================
EXP 3: USING ONLY TEXT FEATURES
==================================================
Dataset for Exp 3
<class 'pandas.core.frame.DataFrame'>
Index: 17702 entries, 0 to 18835
Columns: 3000 entries, desc_0 to text_1499
dtypes: float64(3000)
memory usage: 405.3 MB
None
desc_0 desc_1 desc_2 desc_3 desc_4 desc_5 desc_6 desc_7 desc_8 \
0 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0
desc_9 ... text_1490 text_1491 text_1492 text_1493 text_1494 \
0 0.0 ... -0.142855 0.0 0.0 0.0 0.0
1 0.0 ... -0.142855 0.0 0.0 0.0 0.0
2 0.0 ... -0.142855 0.0 0.0 0.0 0.0
3 0.0 ... -0.142855 0.0 0.0 0.0 0.0
4 0.0 ... -0.142855 0.0 0.0 0.0 0.0
text_1495 text_1496 text_1497 text_1498 text_1499
0 -0.142733 -0.100504 0.0 0.0 0.0
1 -0.142733 -0.100504 0.0 0.0 0.0
2 -0.142733 -0.100504 0.0 0.0 0.0
3 -0.142733 -0.100504 0.0 0.0 0.0
4 -0.142733 -0.100504 0.0 0.0 0.0
[5 rows x 3000 columns]
Applying UMAP for dim reduction...
[I 2024-09-19 17:55:22,898] A new study created in memory with name: no-name-909a06f1-95c6-488e-b35e-549634c3f8ed
Performing K-Means Clustering...
[I 2024-09-19 17:55:28,056] Trial 0 finished with value: 0.3474574387073517 and parameters: {'n_clusters': 7, 'init': 'k-means++'}. Best is trial 0 with value: 0.3474574387073517.
[I 2024-09-19 17:55:33,433] Trial 1 finished with value: 0.7000453472137451 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 1 with value: 0.7000453472137451.
[I 2024-09-19 17:55:38,578] Trial 2 finished with value: 0.41475844383239746 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 1 with value: 0.7000453472137451.
[I 2024-09-19 17:55:43,540] Trial 3 finished with value: 0.34986206889152527 and parameters: {'n_clusters': 9, 'init': 'random'}. Best is trial 1 with value: 0.7000453472137451.
[I 2024-09-19 17:55:48,953] Trial 4 finished with value: 0.7358670234680176 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:55:53,991] Trial 5 finished with value: 0.32672926783561707 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:55:59,151] Trial 6 finished with value: 0.3999803364276886 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:04,084] Trial 7 finished with value: 0.35995355248451233 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:09,149] Trial 8 finished with value: 0.4177650809288025 and parameters: {'n_clusters': 6, 'init': 'random'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:14,677] Trial 9 finished with value: 0.7177287340164185 and parameters: {'n_clusters': 3, 'init': 'random'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:20,202] Trial 10 finished with value: 0.7358670234680176 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:25,524] Trial 11 finished with value: 0.7358670234680176 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:30,964] Trial 12 finished with value: 0.7358670234680176 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:36,305] Trial 13 finished with value: 0.7358670234680176 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:41,355] Trial 14 finished with value: 0.3999803364276886 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
Best params: {'n_clusters': 2, 'init': 'k-means++'}
The Silhouette score is 0.7358670234680176
The Callinski index is 7837.85693359375
Dataset with Labels from KMeans in Exp 3
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
Records found in cluster 0 from KMeans in Exp 3
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
... ... ... ...
18829 1.0 1.0000 0
18831 0.0 0.6466 0
18832 1.0 1.0000 0
18834 1.0 1.0000 0
18835 0.0 0.6772 0
[16759 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5545
No. of records with gender 1 in cluster 0 is 5925
No. of records with gender 2 in cluster 0 is 5289
Records found in cluster 1 from KMeans in Exp 3
[I 2024-09-19 17:56:48,430] A new study created in memory with name: no-name-4c26d09f-cf9c-40a1-b2bd-9e5ad70b1499
gender gender:confidence Cluster_Label 230 1.0 0.6755 1 264 0.0 1.0000 1 282 1.0 1.0000 1 431 0.0 0.6631 1 502 0.0 1.0000 1 ... ... ... ... 18609 1.0 1.0000 1 18646 0.0 1.0000 1 18759 0.0 0.6386 1 18789 0.0 1.0000 1 18803 1.0 1.0000 1 [943 rows x 3 columns] No. of records with gender 0 in cluster 1 is 298 No. of records with gender 1 in cluster 1 is 276 No. of records with gender 2 in cluster 1 is 369 Performing DBSCAN Clustering...
[I 2024-09-19 17:56:54,626] Trial 0 finished with value: 0.39434999227523804 and parameters: {'eps': 0.9001288568044092, 'min_samples': 9}. Best is trial 0 with value: 0.39434999227523804.
[I 2024-09-19 17:57:01,530] Trial 1 finished with value: 0.6305991411209106 and parameters: {'eps': 1.6892734071090372, 'min_samples': 17}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:07,375] Trial 2 finished with value: 0.4701833426952362 and parameters: {'eps': 0.7833320007467396, 'min_samples': 19}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:12,834] Trial 3 finished with value: 0.08904338628053665 and parameters: {'eps': 0.20500303205257642, 'min_samples': 14}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:18,723] Trial 4 finished with value: 0.500934362411499 and parameters: {'eps': 0.7954866903955463, 'min_samples': 13}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:25,434] Trial 5 finished with value: 0.6164038777351379 and parameters: {'eps': 1.560253475900042, 'min_samples': 16}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:31,698] Trial 6 finished with value: 0.5461991429328918 and parameters: {'eps': 1.1036293500557977, 'min_samples': 4}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:37,306] Trial 7 finished with value: 0.42453014850616455 and parameters: {'eps': 0.41359736485670484, 'min_samples': 15}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:43,423] Trial 8 finished with value: 0.5508587956428528 and parameters: {'eps': 0.9630808694485326, 'min_samples': 20}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:49,415] Trial 9 finished with value: 0.476026713848114 and parameters: {'eps': 0.8700905877932096, 'min_samples': 17}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:56,401] Trial 10 finished with value: 0.6314875483512878 and parameters: {'eps': 1.8598038150118554, 'min_samples': 10}. Best is trial 10 with value: 0.6314875483512878.
[I 2024-09-19 17:58:03,379] Trial 11 finished with value: 0.631306529045105 and parameters: {'eps': 1.9086572095048915, 'min_samples': 9}. Best is trial 10 with value: 0.6314875483512878.
[I 2024-09-19 17:58:10,505] Trial 12 finished with value: 0.6309650540351868 and parameters: {'eps': 1.9971290436396727, 'min_samples': 9}. Best is trial 10 with value: 0.6314875483512878.
[I 2024-09-19 17:58:17,431] Trial 13 finished with value: 0.6314875483512878 and parameters: {'eps': 1.844756920307563, 'min_samples': 9}. Best is trial 10 with value: 0.6314875483512878.
[I 2024-09-19 17:58:24,052] Trial 14 finished with value: 0.5369197726249695 and parameters: {'eps': 1.3935522371671678, 'min_samples': 5}. Best is trial 10 with value: 0.6314875483512878.
Found best params: {'eps': 1.8598038150118554, 'min_samples': 10}
The Silhouette score is 0.6314875483512878
The Callinski index is 1509.1162109375
Dataset with Labels from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
Records found in cluster 0 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
... ... ... ...
18829 1.0 1.0000 0
18831 0.0 0.6466 0
18832 1.0 1.0000 0
18834 1.0 1.0000 0
18835 0.0 0.6772 0
[15997 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5341
No. of records with gender 1 in cluster 0 is 5683
No. of records with gender 2 in cluster 0 is 4973
Records found in cluster 1 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
42 2.0 1.0000 1
190 2.0 0.6780 1
252 2.0 1.0000 1
255 1.0 1.0000 1
328 1.0 1.0000 1
382 2.0 0.6753 1
836 1.0 1.0000 1
838 2.0 0.6857 1
980 2.0 1.0000 1
1011 1.0 1.0000 1
1102 1.0 0.6777 1
1276 2.0 1.0000 1
1439 2.0 1.0000 1
1690 2.0 1.0000 1
1702 0.0 1.0000 1
1814 1.0 0.3467 1
1923 0.0 1.0000 1
1938 2.0 1.0000 1
1943 1.0 1.0000 1
2062 0.0 1.0000 1
2141 1.0 1.0000 1
2159 1.0 1.0000 1
2392 2.0 1.0000 1
2420 0.0 1.0000 1
2591 0.0 0.6706 1
2660 0.0 0.3478 1
2856 0.0 1.0000 1
2893 2.0 1.0000 1
2973 0.0 0.6839 1
3034 0.0 0.6673 1
3147 1.0 1.0000 1
3184 0.0 0.6763 1
3326 0.0 1.0000 1
3384 0.0 0.6872 1
3487 1.0 1.0000 1
3799 1.0 1.0000 1
4237 1.0 1.0000 1
4502 0.0 1.0000 1
4550 2.0 1.0000 1
4704 0.0 0.6655 1
4850 0.0 1.0000 1
4913 1.0 1.0000 1
5023 0.0 1.0000 1
5139 2.0 0.6684 1
5727 2.0 1.0000 1
6326 2.0 0.6690 1
6339 0.0 1.0000 1
6525 2.0 0.6797 1
6813 2.0 1.0000 1
6874 0.0 1.0000 1
7582 2.0 0.6667 1
7867 0.0 1.0000 1
7995 2.0 1.0000 1
8096 2.0 1.0000 1
8262 2.0 1.0000 1
8347 2.0 1.0000 1
8459 0.0 0.6652 1
9204 1.0 1.0000 1
9211 0.0 1.0000 1
9264 1.0 1.0000 1
No. of records with gender 0 in cluster 1 is 21
No. of records with gender 1 in cluster 1 is 16
No. of records with gender 2 in cluster 1 is 23
Records found in cluster 2 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
211 2.0 1.0000 2
1594 2.0 0.6983 2
3203 0.0 1.0000 2
4433 1.0 1.0000 2
9376 0.0 1.0000 2
... ... ... ...
18546 1.0 1.0000 2
18573 0.0 1.0000 2
18584 1.0 1.0000 2
18624 1.0 1.0000 2
18656 1.0 1.0000 2
[88 rows x 3 columns]
No. of records with gender 0 in cluster 2 is 23
No. of records with gender 1 in cluster 2 is 36
No. of records with gender 2 in cluster 2 is 29
Records found in cluster 3 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
230 1.0 0.6755 3
264 0.0 1.0000 3
1582 1.0 1.0000 3
3133 0.0 1.0000 3
3292 0.0 0.6814 3
3301 0.0 1.0000 3
3484 0.0 1.0000 3
4219 0.0 1.0000 3
4224 0.0 1.0000 3
4226 2.0 1.0000 3
4253 2.0 1.0000 3
4269 0.0 1.0000 3
4283 1.0 1.0000 3
4298 1.0 0.6539 3
4319 0.0 1.0000 3
4344 2.0 1.0000 3
4356 2.0 0.6709 3
4367 0.0 1.0000 3
4370 0.0 1.0000 3
4381 1.0 0.6719 3
4392 0.0 0.6567 3
4396 0.0 1.0000 3
4426 2.0 0.6838 3
4432 0.0 1.0000 3
4440 0.0 1.0000 3
4444 0.0 0.6422 3
4457 1.0 1.0000 3
4489 1.0 1.0000 3
4506 0.0 1.0000 3
4510 2.0 1.0000 3
4536 2.0 1.0000 3
4558 2.0 0.6866 3
4560 0.0 1.0000 3
4572 1.0 1.0000 3
4584 0.0 1.0000 3
4590 2.0 1.0000 3
4595 1.0 1.0000 3
10146 0.0 0.6757 3
10314 1.0 1.0000 3
10582 2.0 0.6383 3
10622 1.0 0.6692 3
11062 0.0 1.0000 3
11175 0.0 1.0000 3
11817 2.0 1.0000 3
12671 1.0 1.0000 3
12711 0.0 0.6667 3
12771 1.0 0.6677 3
13288 1.0 1.0000 3
13556 2.0 0.6581 3
13780 2.0 1.0000 3
14213 2.0 0.6692 3
14346 1.0 0.6710 3
14750 1.0 1.0000 3
15743 0.0 1.0000 3
15816 1.0 1.0000 3
17504 0.0 0.6567 3
18083 0.0 1.0000 3
18609 1.0 1.0000 3
No. of records with gender 0 in cluster 3 is 26
No. of records with gender 1 in cluster 3 is 18
No. of records with gender 2 in cluster 3 is 14
Records found in cluster 4 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
282 1.0 1.0000 4
502 0.0 1.0000 4
578 1.0 1.0000 4
644 0.0 1.0000 4
771 0.0 1.0000 4
963 2.0 1.0000 4
1433 1.0 1.0000 4
1881 0.0 0.6691 4
2762 2.0 0.6670 4
2903 1.0 0.6763 4
2929 0.0 1.0000 4
3229 0.0 1.0000 4
3308 0.0 0.3364 4
3353 0.0 1.0000 4
3681 2.0 1.0000 4
3770 0.0 1.0000 4
3830 0.0 1.0000 4
4305 1.0 1.0000 4
5040 0.0 1.0000 4
5367 0.0 1.0000 4
5479 0.0 0.6857 4
5634 2.0 0.6840 4
5742 0.0 1.0000 4
6460 2.0 1.0000 4
6862 1.0 1.0000 4
8397 2.0 0.6634 4
8516 2.0 0.6839 4
8918 2.0 1.0000 4
No. of records with gender 0 in cluster 4 is 14
No. of records with gender 1 in cluster 4 is 6
No. of records with gender 2 in cluster 4 is 8
Records found in cluster 5 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
431 0.0 0.6631 5
3276 1.0 1.0000 5
4374 2.0 1.0000 5
4456 1.0 1.0000 5
4653 2.0 1.0000 5
4995 2.0 1.0000 5
5220 2.0 0.6650 5
5372 2.0 1.0000 5
5749 2.0 1.0000 5
6043 2.0 0.6787 5
6172 2.0 1.0000 5
6208 1.0 0.6543 5
6496 2.0 0.6716 5
6669 0.0 1.0000 5
7060 1.0 0.6890 5
7261 0.0 1.0000 5
7439 0.0 1.0000 5
7683 1.0 0.6699 5
7902 0.0 1.0000 5
8120 1.0 1.0000 5
8360 2.0 0.6854 5
8408 0.0 1.0000 5
9100 0.0 1.0000 5
9333 1.0 1.0000 5
10448 2.0 0.6544 5
10820 0.0 0.6635 5
11056 1.0 1.0000 5
12961 1.0 1.0000 5
13252 1.0 1.0000 5
14102 0.0 1.0000 5
14844 0.0 1.0000 5
15017 1.0 1.0000 5
No. of records with gender 0 in cluster 5 is 10
No. of records with gender 1 in cluster 5 is 11
No. of records with gender 2 in cluster 5 is 11
Records found in cluster 6 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
444 0.0 1.0000 6
1112 2.0 1.0000 6
5901 2.0 1.0000 6
5902 0.0 0.6462 6
5904 0.0 1.0000 6
5910 0.0 0.6787 6
5914 2.0 1.0000 6
5930 0.0 0.6512 6
5932 0.0 1.0000 6
5934 2.0 1.0000 6
5935 2.0 1.0000 6
5936 2.0 0.6836 6
5945 2.0 1.0000 6
5952 0.0 1.0000 6
5954 0.0 1.0000 6
5956 2.0 1.0000 6
5961 2.0 1.0000 6
5962 1.0 1.0000 6
5963 0.0 1.0000 6
5964 1.0 1.0000 6
5965 2.0 0.6764 6
5966 2.0 0.6842 6
5973 2.0 0.6509 6
5986 0.0 1.0000 6
5989 2.0 1.0000 6
5990 0.0 0.6713 6
10357 1.0 1.0000 6
11202 0.0 1.0000 6
13236 0.0 1.0000 6
13487 0.0 1.0000 6
14898 1.0 1.0000 6
15100 0.0 0.6715 6
15296 0.0 1.0000 6
16532 1.0 1.0000 6
16536 0.0 0.6770 6
17155 1.0 1.0000 6
No. of records with gender 0 in cluster 6 is 17
No. of records with gender 1 in cluster 6 is 6
No. of records with gender 2 in cluster 6 is 13
Records found in cluster 7 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
575 0.0 1.0000 7
1308 0.0 0.6479 7
2033 1.0 1.0000 7
2308 1.0 0.6774 7
3898 0.0 1.0000 7
5454 2.0 0.6774 7
5539 1.0 1.0000 7
5628 2.0 1.0000 7
5825 1.0 1.0000 7
5847 2.0 0.6717 7
6012 0.0 1.0000 7
6048 2.0 0.6796 7
6114 1.0 0.6620 7
6335 2.0 1.0000 7
6382 2.0 0.6842 7
6417 2.0 1.0000 7
7843 2.0 1.0000 7
8181 0.0 1.0000 7
8355 2.0 0.6778 7
8738 0.0 1.0000 7
No. of records with gender 0 in cluster 7 is 6
No. of records with gender 1 in cluster 7 is 5
No. of records with gender 2 in cluster 7 is 9
Records found in cluster 8 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
805 2.0 1.0000 8
1520 0.0 1.0000 8
2701 0.0 1.0000 8
4906 2.0 0.6681 8
4908 0.0 1.0000 8
4909 2.0 1.0000 8
4910 0.0 1.0000 8
4912 1.0 1.0000 8
4917 1.0 0.6571 8
4918 0.0 1.0000 8
4923 2.0 1.0000 8
4924 2.0 0.6585 8
4929 1.0 1.0000 8
4934 1.0 0.6571 8
4937 2.0 1.0000 8
4944 1.0 0.6711 8
4949 2.0 1.0000 8
4950 1.0 1.0000 8
4951 1.0 1.0000 8
4961 0.0 1.0000 8
4962 1.0 1.0000 8
4965 2.0 0.6695 8
4967 0.0 1.0000 8
4968 1.0 1.0000 8
4970 0.0 1.0000 8
4973 1.0 1.0000 8
4990 1.0 1.0000 8
4997 2.0 0.6957 8
4999 2.0 0.6884 8
8976 1.0 1.0000 8
15005 0.0 1.0000 8
15181 0.0 0.6875 8
18661 2.0 1.0000 8
No. of records with gender 0 in cluster 8 is 10
No. of records with gender 1 in cluster 8 is 12
No. of records with gender 2 in cluster 8 is 11
Records found in cluster 9 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1203 1.0 1.0000 9
1240 1.0 0.6889 9
2115 0.0 1.0000 9
2381 0.0 1.0000 9
3988 2.0 1.0000 9
5994 2.0 0.6611 9
7988 1.0 0.6734 9
8071 1.0 1.0000 9
10735 0.0 1.0000 9
10738 0.0 1.0000 9
11076 2.0 1.0000 9
11179 2.0 1.0000 9
11484 1.0 1.0000 9
11648 1.0 1.0000 9
11746 0.0 1.0000 9
12054 1.0 1.0000 9
13078 0.0 1.0000 9
14056 2.0 1.0000 9
15064 0.0 0.6534 9
15751 1.0 1.0000 9
15757 1.0 1.0000 9
16465 0.0 1.0000 9
16868 1.0 1.0000 9
17448 0.0 1.0000 9
18208 0.0 1.0000 9
18753 0.0 0.6678 9
No. of records with gender 0 in cluster 9 is 11
No. of records with gender 1 in cluster 9 is 10
No. of records with gender 2 in cluster 9 is 5
Records found in cluster 10 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1273 0.0 1.0000 10
1605 2.0 1.0000 10
1761 2.0 1.0000 10
1845 1.0 1.0000 10
1987 1.0 1.0000 10
2274 0.0 1.0000 10
3961 0.0 1.0000 10
4092 0.0 0.3411 10
4424 2.0 1.0000 10
5218 2.0 1.0000 10
5336 1.0 1.0000 10
5445 0.0 1.0000 10
6262 2.0 1.0000 10
6289 1.0 1.0000 10
7003 1.0 1.0000 10
7118 2.0 1.0000 10
7431 1.0 1.0000 10
7540 0.0 0.6859 10
7791 1.0 1.0000 10
8142 2.0 1.0000 10
8601 2.0 0.6700 10
8693 0.0 1.0000 10
9023 1.0 0.6654 10
9265 1.0 1.0000 10
No. of records with gender 0 in cluster 10 is 7
No. of records with gender 1 in cluster 10 is 9
No. of records with gender 2 in cluster 10 is 8
Records found in cluster 11 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1301 2.0 1.0000 11
1666 0.0 1.0000 11
2010 0.0 1.0000 11
2876 2.0 0.6741 11
3238 1.0 1.0000 11
4359 1.0 1.0000 11
5200 0.0 1.0000 11
5203 1.0 1.0000 11
5205 1.0 0.6748 11
5209 1.0 1.0000 11
5211 0.0 0.6738 11
5217 0.0 1.0000 11
5227 1.0 1.0000 11
5232 1.0 1.0000 11
5234 1.0 1.0000 11
5242 1.0 1.0000 11
5256 2.0 0.6475 11
5262 0.0 0.6457 11
5264 0.0 1.0000 11
5265 1.0 1.0000 11
5266 0.0 1.0000 11
5270 2.0 1.0000 11
5271 2.0 0.6812 11
5272 2.0 1.0000 11
5284 1.0 0.6815 11
5289 0.0 1.0000 11
5291 2.0 0.6333 11
5297 0.0 1.0000 11
7900 2.0 1.0000 11
7908 2.0 1.0000 11
7910 2.0 1.0000 11
7914 2.0 1.0000 11
7933 1.0 1.0000 11
7953 0.0 1.0000 11
7956 1.0 1.0000 11
7958 1.0 1.0000 11
7959 0.0 0.6823 11
7963 2.0 1.0000 11
7964 2.0 1.0000 11
7966 0.0 0.6607 11
7967 2.0 0.6737 11
7968 2.0 1.0000 11
7973 0.0 1.0000 11
7975 0.0 1.0000 11
7976 0.0 1.0000 11
7977 2.0 0.6739 11
7980 2.0 1.0000 11
7987 0.0 1.0000 11
7991 1.0 1.0000 11
7999 2.0 0.6726 11
10908 0.0 1.0000 11
11615 2.0 1.0000 11
12253 1.0 1.0000 11
12766 2.0 0.3547 11
13202 2.0 1.0000 11
15562 0.0 1.0000 11
16542 1.0 1.0000 11
No. of records with gender 0 in cluster 11 is 19
No. of records with gender 1 in cluster 11 is 17
No. of records with gender 2 in cluster 11 is 21
Records found in cluster 12 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1303 1.0 1.0000 12
1365 1.0 1.0000 12
5694 2.0 1.0000 12
8923 2.0 1.0000 12
8925 0.0 1.0000 12
8927 1.0 1.0000 12
8930 2.0 1.0000 12
8940 2.0 0.6815 12
8943 2.0 1.0000 12
8944 2.0 0.6641 12
8945 0.0 1.0000 12
8947 2.0 1.0000 12
8948 2.0 1.0000 12
8951 0.0 0.6752 12
8952 1.0 0.6734 12
8953 1.0 1.0000 12
8954 2.0 1.0000 12
8965 2.0 1.0000 12
8971 1.0 1.0000 12
8981 1.0 1.0000 12
8987 2.0 1.0000 12
8988 0.0 1.0000 12
8989 1.0 1.0000 12
8990 2.0 1.0000 12
8991 2.0 0.6728 12
8995 2.0 0.6761 12
8997 0.0 1.0000 12
9395 1.0 1.0000 12
9792 2.0 0.6642 12
11130 1.0 1.0000 12
11659 0.0 1.0000 12
13220 2.0 1.0000 12
14625 0.0 1.0000 12
15940 0.0 1.0000 12
17978 2.0 1.0000 12
No. of records with gender 0 in cluster 12 is 8
No. of records with gender 1 in cluster 12 is 10
No. of records with gender 2 in cluster 12 is 17
Records found in cluster 13 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1458 2.0 1.0000 13
7860 2.0 0.6321 13
8313 0.0 1.0000 13
8322 1.0 1.0000 13
8327 0.0 0.6763 13
8331 2.0 0.6716 13
8333 2.0 1.0000 13
8337 1.0 1.0000 13
8338 0.0 1.0000 13
8339 0.0 1.0000 13
8340 2.0 0.6707 13
8341 1.0 0.6699 13
8353 2.0 0.6650 13
8356 1.0 0.6517 13
8358 2.0 0.6965 13
8384 0.0 1.0000 13
8385 1.0 1.0000 13
8391 0.0 1.0000 13
12693 0.0 1.0000 13
12899 2.0 1.0000 13
15029 1.0 1.0000 13
No. of records with gender 0 in cluster 13 is 7
No. of records with gender 1 in cluster 13 is 6
No. of records with gender 2 in cluster 13 is 8
Records found in cluster 14 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1474 1.0 0.3390 14
2740 0.0 1.0000 14
4804 2.0 0.6691 14
4811 0.0 1.0000 14
4817 0.0 0.3384 14
... ... ... ...
13842 1.0 1.0000 14
14718 0.0 1.0000 14
16342 1.0 1.0000 14
16883 1.0 1.0000 14
17182 0.0 1.0000 14
[123 rows x 3 columns]
No. of records with gender 0 in cluster 14 is 36
No. of records with gender 1 in cluster 14 is 29
No. of records with gender 2 in cluster 14 is 58
Records found in cluster 15 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1580 1.0 1.0000 15
9206 2.0 0.3398 15
9207 2.0 1.0000 15
9212 0.0 1.0000 15
9215 1.0 0.6818 15
9216 2.0 0.6519 15
9217 2.0 0.3376 15
9220 2.0 1.0000 15
9221 2.0 1.0000 15
9225 2.0 1.0000 15
9228 0.0 1.0000 15
9243 0.0 0.3506 15
9249 1.0 0.3542 15
9253 2.0 1.0000 15
9278 1.0 1.0000 15
9280 1.0 1.0000 15
9283 2.0 0.6659 15
9289 2.0 1.0000 15
9293 0.0 1.0000 15
9294 0.0 1.0000 15
9904 0.0 1.0000 15
No. of records with gender 0 in cluster 15 is 6
No. of records with gender 1 in cluster 15 is 5
No. of records with gender 2 in cluster 15 is 10
Records found in cluster 16 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1897 1.0 0.6483 16
8401 0.0 0.6732 16
8402 2.0 0.6767 16
8403 2.0 0.6575 16
8407 0.0 0.6763 16
8411 1.0 1.0000 16
8412 1.0 0.6900 16
8429 1.0 1.0000 16
8460 2.0 0.6828 16
8466 0.0 1.0000 16
8470 1.0 1.0000 16
8478 0.0 1.0000 16
8479 2.0 0.3625 16
8487 0.0 0.6806 16
8489 0.0 1.0000 16
8496 0.0 1.0000 16
12914 1.0 1.0000 16
No. of records with gender 0 in cluster 16 is 7
No. of records with gender 1 in cluster 16 is 6
No. of records with gender 2 in cluster 16 is 4
Records found in cluster 17 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1940 2.0 0.6675 17
7703 1.0 1.0000 17
7705 1.0 1.0000 17
7727 2.0 1.0000 17
7738 2.0 1.0000 17
7743 0.0 1.0000 17
7745 1.0 1.0000 17
7746 2.0 1.0000 17
7747 2.0 0.6745 17
7748 2.0 1.0000 17
7751 2.0 1.0000 17
7752 1.0 0.6649 17
7757 2.0 1.0000 17
7759 2.0 1.0000 17
7760 2.0 1.0000 17
7761 1.0 1.0000 17
7793 0.0 0.6691 17
7797 2.0 0.6600 17
No. of records with gender 0 in cluster 17 is 2
No. of records with gender 1 in cluster 17 is 5
No. of records with gender 2 in cluster 17 is 11
Records found in cluster 18 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
2135 2.0 1.0000 18
3581 0.0 1.0000 18
3705 2.0 0.6581 18
3809 2.0 1.0000 18
3906 1.0 0.6422 18
... ... ... ...
18531 2.0 1.0000 18
18646 0.0 1.0000 18
18759 0.0 0.6386 18
18789 0.0 1.0000 18
18803 1.0 1.0000 18
[109 rows x 3 columns]
No. of records with gender 0 in cluster 18 is 38
No. of records with gender 1 in cluster 18 is 32
No. of records with gender 2 in cluster 18 is 39
Records found in cluster 19 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
2138 1.0 1.0000 19
2145 0.0 1.0000 19
2146 1.0 1.0000 19
2147 1.0 1.0000 19
2148 1.0 0.3576 19
2156 0.0 1.0000 19
2166 1.0 1.0000 19
2168 0.0 0.6825 19
2169 1.0 1.0000 19
2171 1.0 1.0000 19
2172 0.0 1.0000 19
2182 2.0 1.0000 19
2185 0.0 1.0000 19
2186 0.0 0.3403 19
2187 1.0 1.0000 19
2188 2.0 0.6812 19
2189 0.0 0.6582 19
2191 0.0 1.0000 19
2194 1.0 1.0000 19
2196 1.0 1.0000 19
2204 1.0 0.6587 19
2205 0.0 0.6685 19
2206 1.0 0.6551 19
2207 1.0 1.0000 19
2210 1.0 1.0000 19
2216 1.0 0.6896 19
2217 1.0 0.6832 19
2220 1.0 1.0000 19
2223 2.0 1.0000 19
5916 2.0 0.6935 19
9793 0.0 0.6664 19
11047 1.0 1.0000 19
No. of records with gender 0 in cluster 19 is 10
No. of records with gender 1 in cluster 19 is 18
No. of records with gender 2 in cluster 19 is 4
Records found in cluster 20 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
3252 0.0 1.0000 20
8701 1.0 1.0000 20
8711 2.0 1.0000 20
8728 0.0 1.0000 20
8732 2.0 0.6946 20
8739 0.0 1.0000 20
8744 2.0 1.0000 20
8746 2.0 0.6916 20
8764 2.0 0.6674 20
8765 1.0 0.6611 20
8767 0.0 1.0000 20
8769 2.0 1.0000 20
8772 0.0 0.6732 20
8777 0.0 1.0000 20
8779 2.0 1.0000 20
8782 1.0 1.0000 20
8783 2.0 1.0000 20
8784 2.0 1.0000 20
11222 1.0 1.0000 20
16945 0.0 1.0000 20
No. of records with gender 0 in cluster 20 is 7
No. of records with gender 1 in cluster 20 is 4
No. of records with gender 2 in cluster 20 is 9
Records found in cluster 21 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
3316 0.0 1.0000 21
7600 0.0 1.0000 21
7601 2.0 0.6609 21
7611 0.0 0.6666 21
7613 2.0 1.0000 21
7614 2.0 0.6866 21
7615 2.0 1.0000 21
7620 1.0 0.6549 21
7621 1.0 1.0000 21
7622 2.0 1.0000 21
7626 0.0 1.0000 21
7627 0.0 0.7037 21
7629 2.0 1.0000 21
7652 0.0 0.6772 21
7655 1.0 1.0000 21
7662 0.0 1.0000 21
7665 2.0 0.6832 21
7667 0.0 1.0000 21
7669 2.0 1.0000 21
7670 1.0 1.0000 21
7672 2.0 1.0000 21
7679 1.0 1.0000 21
7680 1.0 1.0000 21
7681 1.0 1.0000 21
7686 2.0 1.0000 21
7694 2.0 1.0000 21
7697 1.0 1.0000 21
12196 0.0 0.7049 21
13766 1.0 1.0000 21
14354 0.0 1.0000 21
No. of records with gender 0 in cluster 21 is 10
No. of records with gender 1 in cluster 21 is 9
No. of records with gender 2 in cluster 21 is 11
Records found in cluster 22 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
3744 0.0 0.6440 22
3927 0.0 1.0000 22
3994 1.0 1.0000 22
4057 2.0 0.3516 22
4300 2.0 0.6736 22
... ... ... ...
12397 0.0 1.0000 22
12507 2.0 1.0000 22
12659 2.0 1.0000 22
12754 2.0 0.6615 22
14756 1.0 1.0000 22
[75 rows x 3 columns]
No. of records with gender 0 in cluster 22 is 18
No. of records with gender 1 in cluster 22 is 21
No. of records with gender 2 in cluster 22 is 36
Records found in cluster 23 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
4547 2.0 1.0000 23
7804 1.0 1.0000 23
7810 2.0 1.0000 23
7811 2.0 0.6341 23
7817 0.0 1.0000 23
7819 0.0 1.0000 23
7820 0.0 1.0000 23
7821 2.0 1.0000 23
7822 2.0 1.0000 23
7824 2.0 1.0000 23
7825 0.0 1.0000 23
7827 2.0 0.3472 23
7830 1.0 1.0000 23
7882 0.0 1.0000 23
7888 2.0 0.6506 23
7890 2.0 1.0000 23
7892 2.0 1.0000 23
7897 0.0 0.6803 23
7899 1.0 1.0000 23
8203 2.0 1.0000 23
8204 2.0 0.6746 23
8208 2.0 0.6844 23
8236 0.0 1.0000 23
8246 2.0 0.6598 23
8247 1.0 1.0000 23
8250 1.0 1.0000 23
8251 0.0 0.6624 23
8261 1.0 1.0000 23
8264 0.0 1.0000 23
8269 0.0 0.6774 23
8272 2.0 1.0000 23
8284 2.0 0.6691 23
No. of records with gender 0 in cluster 23 is 10
No. of records with gender 1 in cluster 23 is 6
No. of records with gender 2 in cluster 23 is 16
Records found in cluster 24 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
4606 0.0 1.0000 24
4608 0.0 0.6618 24
4615 2.0 0.6590 24
4621 1.0 1.0000 24
4627 2.0 1.0000 24
4643 0.0 1.0000 24
4657 2.0 0.6751 24
4664 1.0 1.0000 24
4674 2.0 1.0000 24
4675 2.0 1.0000 24
4685 2.0 1.0000 24
4690 0.0 0.6763 24
4691 0.0 1.0000 24
4710 2.0 1.0000 24
4712 0.0 1.0000 24
4717 2.0 1.0000 24
4720 2.0 1.0000 24
4722 2.0 0.6686 24
4731 1.0 1.0000 24
4743 2.0 1.0000 24
4746 1.0 1.0000 24
4772 2.0 1.0000 24
4778 1.0 0.3592 24
4780 2.0 1.0000 24
4781 2.0 0.6475 24
4782 1.0 0.6697 24
4783 2.0 1.0000 24
4785 2.0 0.6811 24
4789 2.0 1.0000 24
4790 1.0 1.0000 24
4798 2.0 0.6736 24
4799 0.0 1.0000 24
6627 2.0 1.0000 24
6629 1.0 1.0000 24
6633 0.0 1.0000 24
6650 2.0 1.0000 24
6654 1.0 1.0000 24
6660 2.0 1.0000 24
6664 2.0 1.0000 24
6665 1.0 1.0000 24
6668 0.0 1.0000 24
6670 0.0 1.0000 24
6678 1.0 1.0000 24
6685 2.0 1.0000 24
6688 2.0 1.0000 24
11370 2.0 1.0000 24
13222 2.0 1.0000 24
No. of records with gender 0 in cluster 24 is 10
No. of records with gender 1 in cluster 24 is 11
No. of records with gender 2 in cluster 24 is 26
Records found in cluster 25 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5013 1.0 1.0000 25
5567 2.0 1.0000 25
8109 1.0 1.0000 25
8112 0.0 1.0000 25
8113 2.0 0.6675 25
8116 2.0 0.6611 25
8118 1.0 1.0000 25
8122 2.0 0.6623 25
8123 2.0 0.6605 25
8128 0.0 1.0000 25
8132 2.0 0.6665 25
8146 1.0 1.0000 25
8159 2.0 1.0000 25
8165 0.0 1.0000 25
8176 1.0 1.0000 25
8177 2.0 1.0000 25
8178 2.0 1.0000 25
8185 2.0 1.0000 25
8190 2.0 0.6735 25
8191 1.0 0.3568 25
8192 2.0 0.6726 25
8199 2.0 1.0000 25
No. of records with gender 0 in cluster 25 is 3
No. of records with gender 1 in cluster 25 is 6
No. of records with gender 2 in cluster 25 is 13
Records found in cluster 26 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5206 1.0 1.0000 26
5629 2.0 1.0000 26
5640 0.0 1.0000 26
5944 1.0 1.0000 26
6093 1.0 0.6653 26
6157 2.0 0.6567 26
6174 2.0 0.6619 26
6409 0.0 1.0000 26
6514 1.0 1.0000 26
7289 0.0 1.0000 26
10812 1.0 0.6827 26
12073 1.0 1.0000 26
12796 1.0 1.0000 26
13106 1.0 0.6574 26
13303 1.0 1.0000 26
13417 1.0 1.0000 26
13502 1.0 1.0000 26
13716 1.0 0.6830 26
13901 2.0 0.6611 26
14140 0.0 0.6645 26
14214 2.0 1.0000 26
14269 2.0 0.6868 26
14337 1.0 1.0000 26
14412 1.0 1.0000 26
14483 0.0 1.0000 26
14645 1.0 1.0000 26
14855 2.0 1.0000 26
15443 2.0 1.0000 26
15534 0.0 1.0000 26
15807 0.0 1.0000 26
15916 1.0 1.0000 26
15950 2.0 1.0000 26
16188 1.0 1.0000 26
16418 2.0 1.0000 26
16672 1.0 1.0000 26
16725 1.0 1.0000 26
16854 2.0 1.0000 26
17269 0.0 1.0000 26
17351 1.0 0.6556 26
17442 1.0 1.0000 26
17842 0.0 1.0000 26
18302 1.0 1.0000 26
18412 2.0 0.6690 26
18510 1.0 1.0000 26
18731 1.0 1.0000 26
18738 2.0 1.0000 26
No. of records with gender 0 in cluster 26 is 9
No. of records with gender 1 in cluster 26 is 24
No. of records with gender 2 in cluster 26 is 13
Records found in cluster 27 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5400 0.0 1.0000 27
5401 2.0 0.6836 27
5407 2.0 0.6785 27
5408 2.0 1.0000 27
5409 0.0 1.0000 27
5412 2.0 1.0000 27
5427 1.0 1.0000 27
5429 0.0 1.0000 27
5433 2.0 0.6736 27
5434 1.0 1.0000 27
5436 2.0 0.6602 27
5442 1.0 0.3409 27
5443 2.0 0.6483 27
5447 1.0 1.0000 27
5448 2.0 0.6654 27
5449 1.0 1.0000 27
5456 0.0 1.0000 27
5457 2.0 0.6468 27
5466 2.0 1.0000 27
5470 2.0 1.0000 27
5471 0.0 1.0000 27
5472 0.0 1.0000 27
5480 1.0 1.0000 27
5485 2.0 1.0000 27
5486 1.0 1.0000 27
5487 1.0 0.6669 27
5490 2.0 1.0000 27
5491 2.0 1.0000 27
5635 2.0 1.0000 27
6074 0.0 1.0000 27
13614 1.0 1.0000 27
No. of records with gender 0 in cluster 27 is 7
No. of records with gender 1 in cluster 27 is 9
No. of records with gender 2 in cluster 27 is 15
Records found in cluster 28 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5506 2.0 0.6595 28
5511 1.0 1.0000 28
5524 0.0 0.6722 28
5541 0.0 1.0000 28
5542 2.0 1.0000 28
5544 1.0 0.3374 28
5546 1.0 1.0000 28
5552 2.0 1.0000 28
5558 2.0 1.0000 28
5559 2.0 0.6745 28
5560 1.0 1.0000 28
5561 0.0 1.0000 28
5563 2.0 1.0000 28
5564 2.0 1.0000 28
5566 1.0 0.6607 28
5570 2.0 1.0000 28
5572 1.0 1.0000 28
5579 1.0 1.0000 28
5583 2.0 1.0000 28
5588 0.0 0.6795 28
5597 0.0 1.0000 28
5598 0.0 1.0000 28
No. of records with gender 0 in cluster 28 is 6
No. of records with gender 1 in cluster 28 is 7
No. of records with gender 2 in cluster 28 is 9
Records found in cluster 29 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5605 1.0 1.0000 29
5611 2.0 0.6856 29
5616 2.0 1.0000 29
5625 0.0 1.0000 29
5626 2.0 0.6589 29
5632 2.0 0.6651 29
5643 0.0 1.0000 29
5644 1.0 0.6725 29
5661 2.0 1.0000 29
5665 1.0 1.0000 29
5669 2.0 1.0000 29
5670 1.0 0.6752 29
5671 2.0 0.3424 29
5672 2.0 1.0000 29
5673 0.0 0.6761 29
5674 1.0 1.0000 29
5675 2.0 1.0000 29
5679 2.0 0.6816 29
5681 1.0 1.0000 29
5683 2.0 1.0000 29
5685 0.0 1.0000 29
5686 2.0 1.0000 29
5687 2.0 0.6799 29
5689 2.0 0.6805 29
5696 0.0 1.0000 29
5697 0.0 0.6892 29
18237 1.0 1.0000 29
No. of records with gender 0 in cluster 29 is 6
No. of records with gender 1 in cluster 29 is 7
No. of records with gender 2 in cluster 29 is 14
Records found in cluster 30 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5705 1.0 1.0000 30
5709 2.0 0.6860 30
5711 2.0 1.0000 30
5712 1.0 1.0000 30
5726 2.0 0.6735 30
5746 2.0 0.3410 30
5752 2.0 0.6747 30
5754 1.0 1.0000 30
5757 1.0 1.0000 30
5766 2.0 1.0000 30
5767 2.0 1.0000 30
5768 1.0 0.3631 30
5770 2.0 1.0000 30
5773 2.0 0.6769 30
5777 2.0 0.6638 30
5782 1.0 1.0000 30
5786 2.0 1.0000 30
5790 0.0 1.0000 30
5792 2.0 1.0000 30
5793 2.0 0.6675 30
5794 2.0 1.0000 30
5798 2.0 1.0000 30
10884 0.0 0.6712 30
11215 2.0 1.0000 30
No. of records with gender 0 in cluster 30 is 2
No. of records with gender 1 in cluster 30 is 6
No. of records with gender 2 in cluster 30 is 16
Records found in cluster 31 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5800 1.0 1.0000 31
5807 1.0 1.0000 31
5809 0.0 1.0000 31
5810 1.0 1.0000 31
5819 2.0 0.6667 31
5835 2.0 1.0000 31
5838 2.0 1.0000 31
5841 2.0 0.6645 31
5843 0.0 0.6658 31
5846 2.0 1.0000 31
5849 0.0 0.6792 31
5861 2.0 0.6808 31
5862 0.0 1.0000 31
5868 1.0 1.0000 31
5869 1.0 1.0000 31
5870 0.0 0.3441 31
5877 1.0 1.0000 31
5881 2.0 1.0000 31
5883 2.0 0.6725 31
5885 2.0 0.6640 31
5894 1.0 1.0000 31
5898 2.0 0.6675 31
11156 1.0 1.0000 31
12450 1.0 1.0000 31
13833 0.0 0.6955 31
No. of records with gender 0 in cluster 31 is 6
No. of records with gender 1 in cluster 31 is 9
No. of records with gender 2 in cluster 31 is 10
Records found in cluster 32 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
6101 1.0 0.6543 32
6102 0.0 0.6699 32
6103 0.0 1.0000 32
6109 0.0 1.0000 32
6129 2.0 0.6778 32
6131 0.0 1.0000 32
6133 0.0 0.6655 32
6134 0.0 1.0000 32
6147 2.0 0.6540 32
6149 0.0 1.0000 32
6151 2.0 0.6642 32
6156 2.0 1.0000 32
6158 1.0 1.0000 32
6164 1.0 1.0000 32
6167 2.0 0.6742 32
6169 2.0 0.6866 32
6178 1.0 1.0000 32
6180 1.0 1.0000 32
6190 0.0 1.0000 32
6192 2.0 0.6652 32
6197 1.0 0.6513 32
15331 0.0 0.6709 32
No. of records with gender 0 in cluster 32 is 9
No. of records with gender 1 in cluster 32 is 6
No. of records with gender 2 in cluster 32 is 7
Records found in cluster 33 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
6301 2.0 1.0000 33
6302 2.0 1.0000 33
6309 1.0 0.3750 33
6311 2.0 1.0000 33
6318 1.0 1.0000 33
6319 0.0 0.6471 33
6327 2.0 0.6733 33
6332 0.0 1.0000 33
6358 2.0 0.6692 33
6366 2.0 0.6662 33
6373 2.0 1.0000 33
6374 2.0 1.0000 33
6378 0.0 1.0000 33
6381 1.0 1.0000 33
6383 2.0 0.6754 33
6389 0.0 1.0000 33
6390 1.0 1.0000 33
6391 1.0 1.0000 33
6393 2.0 1.0000 33
6397 1.0 1.0000 33
6398 2.0 1.0000 33
6399 1.0 1.0000 33
10210 2.0 0.6588 33
16505 1.0 1.0000 33
18762 1.0 1.0000 33
No. of records with gender 0 in cluster 33 is 4
No. of records with gender 1 in cluster 33 is 9
No. of records with gender 2 in cluster 33 is 12
Records found in cluster 34 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
6502 0.0 1.0000 34
6505 2.0 1.0000 34
6516 0.0 1.0000 34
6521 2.0 1.0000 34
6523 1.0 1.0000 34
... ... ... ...
16234 2.0 0.6937 34
16385 1.0 1.0000 34
17421 0.0 1.0000 34
18026 0.0 1.0000 34
18443 2.0 1.0000 34
[70 rows x 3 columns]
No. of records with gender 0 in cluster 34 is 19
No. of records with gender 1 in cluster 34 is 16
No. of records with gender 2 in cluster 34 is 35
Records found in cluster 35 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
6722 1.0 1.0000 35
6726 0.0 1.0000 35
6728 2.0 0.6634 35
6730 2.0 0.6681 35
6732 1.0 0.6882 35
6742 2.0 0.6625 35
6758 0.0 0.3469 35
6759 1.0 0.6543 35
6772 2.0 1.0000 35
6786 2.0 0.6694 35
6787 2.0 1.0000 35
6788 2.0 1.0000 35
6789 2.0 1.0000 35
6793 1.0 0.6699 35
6795 2.0 0.6741 35
14387 1.0 1.0000 35
16986 0.0 1.0000 35
No. of records with gender 0 in cluster 35 is 3
No. of records with gender 1 in cluster 35 is 5
No. of records with gender 2 in cluster 35 is 9
Records found in cluster 36 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
7002 2.0 1.0000 36
7016 0.0 1.0000 36
7017 2.0 0.6646 36
7033 1.0 1.0000 36
7040 1.0 1.0000 36
7043 0.0 1.0000 36
7048 2.0 1.0000 36
7052 2.0 0.6595 36
7053 2.0 1.0000 36
7058 1.0 1.0000 36
7062 0.0 1.0000 36
7065 2.0 1.0000 36
7087 2.0 0.6671 36
7091 1.0 0.6642 36
7095 2.0 1.0000 36
7096 2.0 0.6782 36
7097 2.0 0.6788 36
No. of records with gender 0 in cluster 36 is 3
No. of records with gender 1 in cluster 36 is 4
No. of records with gender 2 in cluster 36 is 10
Records found in cluster 37 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
7101 2.0 1.0000 37
7102 0.0 1.0000 37
7105 0.0 1.0000 37
7109 1.0 1.0000 37
7113 2.0 0.6718 37
7115 0.0 0.3451 37
7123 0.0 1.0000 37
7128 2.0 0.6585 37
7130 2.0 1.0000 37
7136 1.0 0.6835 37
7148 0.0 0.6750 37
7153 1.0 1.0000 37
7158 1.0 1.0000 37
7162 1.0 1.0000 37
7166 2.0 0.6635 37
7176 1.0 1.0000 37
7184 2.0 1.0000 37
No. of records with gender 0 in cluster 37 is 5
No. of records with gender 1 in cluster 37 is 6
No. of records with gender 2 in cluster 37 is 6
Records found in cluster 38 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
7210 0.0 0.6617 38
7215 2.0 1.0000 38
7216 2.0 0.6921 38
7228 2.0 0.6766 38
7230 1.0 1.0000 38
7234 0.0 1.0000 38
7250 2.0 1.0000 38
7258 1.0 0.6902 38
7259 0.0 1.0000 38
7260 2.0 1.0000 38
7266 2.0 1.0000 38
7273 1.0 1.0000 38
7277 0.0 0.3487 38
7284 0.0 0.6661 38
7288 2.0 1.0000 38
7297 2.0 0.6853 38
No. of records with gender 0 in cluster 38 is 5
No. of records with gender 1 in cluster 38 is 3
No. of records with gender 2 in cluster 38 is 8
Records found in cluster 39 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
7381 2.0 1.0000 39
7470 1.0 0.6810 39
7542 0.0 1.0000 39
7871 2.0 1.0000 39
7946 1.0 1.0000 39
8253 2.0 1.0000 39
8477 1.0 1.0000 39
8657 1.0 1.0000 39
8755 0.0 0.6707 39
8810 0.0 1.0000 39
9039 1.0 1.0000 39
9247 2.0 0.6622 39
9317 0.0 1.0000 39
No. of records with gender 0 in cluster 39 is 4
No. of records with gender 1 in cluster 39 is 5
No. of records with gender 2 in cluster 39 is 4
Records found in cluster 40 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
7500 1.0 1.0000 40
7502 1.0 0.6617 40
7505 0.0 1.0000 40
7507 0.0 0.6848 40
7508 0.0 1.0000 40
7509 1.0 1.0000 40
7510 0.0 1.0000 40
7511 2.0 1.0000 40
7512 1.0 0.6739 40
7513 0.0 1.0000 40
7524 1.0 1.0000 40
7531 2.0 1.0000 40
7532 2.0 1.0000 40
7534 2.0 1.0000 40
7581 1.0 1.0000 40
7586 0.0 1.0000 40
7593 2.0 1.0000 40
7596 0.0 1.0000 40
7598 2.0 1.0000 40
No. of records with gender 0 in cluster 40 is 7
No. of records with gender 1 in cluster 40 is 6
No. of records with gender 2 in cluster 40 is 6
Records found in cluster 41 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
7616 2.0 0.6675 41
7675 2.0 1.0000 41
7744 2.0 0.6761 41
7795 1.0 0.6602 41
8010 1.0 1.0000 41
8069 1.0 1.0000 41
8125 1.0 1.0000 41
8180 1.0 0.6850 41
8395 1.0 1.0000 41
8532 1.0 1.0000 41
8587 2.0 1.0000 41
8906 1.0 0.7047 41
8977 1.0 1.0000 41
9101 0.0 0.3496 41
9172 0.0 1.0000 41
10038 0.0 1.0000 41
17122 2.0 0.6583 41
No. of records with gender 0 in cluster 41 is 3
No. of records with gender 1 in cluster 41 is 9
No. of records with gender 2 in cluster 41 is 5
Records found in cluster 42 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
8024 2.0 1.0000 42
8033 0.0 0.6701 42
8039 1.0 1.0000 42
8046 2.0 1.0000 42
8050 2.0 1.0000 42
8052 0.0 0.7050 42
8055 0.0 1.0000 42
8057 1.0 1.0000 42
8058 2.0 1.0000 42
8059 1.0 1.0000 42
8062 1.0 1.0000 42
8063 1.0 1.0000 42
8065 1.0 0.6688 42
8067 2.0 0.3442 42
8068 1.0 1.0000 42
8070 1.0 0.6698 42
8078 0.0 1.0000 42
8081 2.0 1.0000 42
8085 0.0 1.0000 42
8097 0.0 1.0000 42
16912 1.0 0.6483 42
No. of records with gender 0 in cluster 42 is 6
No. of records with gender 1 in cluster 42 is 9
No. of records with gender 2 in cluster 42 is 6
Records found in cluster 43 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
8607 2.0 0.6659 43
8613 2.0 1.0000 43
8616 2.0 1.0000 43
8617 2.0 0.6774 43
8619 0.0 0.6647 43
8620 2.0 0.6975 43
8622 0.0 0.6634 43
8623 2.0 0.6778 43
8624 1.0 1.0000 43
8627 2.0 0.6829 43
8632 2.0 1.0000 43
8638 0.0 1.0000 43
8642 2.0 0.6688 43
8645 2.0 0.6778 43
8647 2.0 1.0000 43
8675 2.0 1.0000 43
8676 1.0 0.6602 43
8677 0.0 0.6772 43
8679 2.0 1.0000 43
8680 2.0 1.0000 43
8681 0.0 0.6507 43
8688 2.0 0.3354 43
8690 2.0 1.0000 43
8691 2.0 0.3595 43
8694 2.0 0.6736 43
8699 0.0 1.0000 43
13069 0.0 1.0000 43
13603 1.0 1.0000 43
15290 2.0 1.0000 43
17358 0.0 1.0000 43
No. of records with gender 0 in cluster 43 is 8
No. of records with gender 1 in cluster 43 is 3
No. of records with gender 2 in cluster 43 is 19
Records found in cluster 44 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
8804 2.0 0.6561 44
8834 2.0 1.0000 44
8843 0.0 0.3571 44
8844 2.0 1.0000 44
8849 0.0 0.6906 44
8852 0.0 1.0000 44
8854 0.0 1.0000 44
8855 1.0 0.6440 44
8859 2.0 1.0000 44
8864 0.0 0.3421 44
8865 1.0 1.0000 44
8873 0.0 1.0000 44
8874 1.0 1.0000 44
8878 2.0 0.6640 44
8881 0.0 1.0000 44
8884 1.0 0.6612 44
8886 2.0 0.3536 44
17100 1.0 1.0000 44
No. of records with gender 0 in cluster 44 is 7
No. of records with gender 1 in cluster 44 is 5
No. of records with gender 2 in cluster 44 is 6
Records found in cluster 45 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
9001 1.0 1.0000 45
9020 0.0 1.0000 45
9028 1.0 0.6849 45
9033 0.0 1.0000 45
9038 1.0 0.6667 45
9043 1.0 1.0000 45
9046 2.0 0.6745 45
9050 1.0 0.6658 45
9052 2.0 0.6826 45
9054 1.0 1.0000 45
9055 1.0 1.0000 45
9056 2.0 1.0000 45
9061 0.0 1.0000 45
9064 2.0 1.0000 45
9069 2.0 0.6595 45
9070 0.0 1.0000 45
9072 1.0 0.6774 45
9076 2.0 1.0000 45
9079 0.0 1.0000 45
9080 1.0 0.6532 45
9081 0.0 1.0000 45
9082 0.0 1.0000 45
9083 0.0 1.0000 45
9089 1.0 1.0000 45
12197 2.0 1.0000 45
13641 0.0 1.0000 45
No. of records with gender 0 in cluster 45 is 9
No. of records with gender 1 in cluster 45 is 10
No. of records with gender 2 in cluster 45 is 7
Records found in cluster 46 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
9105 2.0 0.6468 46
9109 0.0 0.6553 46
9112 1.0 1.0000 46
9113 0.0 1.0000 46
9115 2.0 0.6771 46
9118 2.0 0.6712 46
9123 2.0 1.0000 46
9125 2.0 1.0000 46
9130 2.0 0.6741 46
9136 2.0 1.0000 46
9144 2.0 1.0000 46
9150 1.0 1.0000 46
9151 1.0 0.6453 46
9152 0.0 1.0000 46
9165 0.0 1.0000 46
9166 2.0 1.0000 46
9178 2.0 0.6698 46
9190 1.0 1.0000 46
9194 2.0 1.0000 46
9195 1.0 1.0000 46
9945 2.0 0.6779 46
No. of records with gender 0 in cluster 46 is 4
No. of records with gender 1 in cluster 46 is 5
No. of records with gender 2 in cluster 46 is 12
Records found in cluster 47 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
9382 2.0 1.0000 47
9398 1.0 1.0000 47
9475 0.0 1.0000 47
9496 0.0 1.0000 47
9511 2.0 0.6634 47
... ... ... ...
15169 1.0 1.0000 47
15207 1.0 1.0000 47
15391 2.0 1.0000 47
15439 2.0 1.0000 47
15622 2.0 1.0000 47
[68 rows x 3 columns]
No. of records with gender 0 in cluster 47 is 18
No. of records with gender 1 in cluster 47 is 24
No. of records with gender 2 in cluster 47 is 26
Records found in cluster 48 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
9648 0.0 1.0000 48
10111 2.0 1.0000 48
10551 2.0 0.6362 48
10903 1.0 1.0000 48
11265 1.0 1.0000 48
11650 0.0 1.0000 48
12295 0.0 1.0000 48
12731 2.0 1.0000 48
15770 0.0 0.6808 48
16201 2.0 1.0000 48
No. of records with gender 0 in cluster 48 is 4
No. of records with gender 1 in cluster 48 is 2
No. of records with gender 2 in cluster 48 is 4
Records found in cluster 49 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
11119 1.0 1.0000 49
11727 2.0 1.0000 49
12333 1.0 1.0000 49
12992 0.0 1.0000 49
13486 2.0 1.0000 49
14046 0.0 1.0000 49
14958 2.0 1.0000 49
15597 1.0 0.3362 49
16706 0.0 1.0000 49
17186 1.0 1.0000 49
17599 0.0 0.6654 49
18270 0.0 1.0000 49
No. of records with gender 0 in cluster 49 is 5
No. of records with gender 1 in cluster 49 is 4
No. of records with gender 2 in cluster 49 is 3
Records classified as noise
gender gender:confidence Cluster_Label
812 2.0 0.6678 -1
1367 1.0 1.0000 -1
1544 0.0 1.0000 -1
2154 1.0 0.6561 -1
2382 1.0 1.0000 -1
2481 0.0 1.0000 -1
2897 2.0 1.0000 -1
3283 2.0 1.0000 -1
3341 1.0 1.0000 -1
3526 1.0 1.0000 -1
3938 2.0 0.6545 -1
4051 2.0 1.0000 -1
4277 1.0 1.0000 -1
4650 2.0 0.3571 -1
5424 0.0 1.0000 -1
6140 2.0 0.6679 -1
6313 1.0 1.0000 -1
7107 2.0 0.6865 -1
7453 2.0 0.6782 -1
8798 1.0 1.0000 -1
8836 0.0 0.6645 -1
8905 1.0 1.0000 -1
14448 0.0 1.0000 -1
14613 0.0 1.0000 -1
14791 1.0 1.0000 -1
15015 1.0 1.0000 -1
15216 0.0 1.0000 -1
---- VISUALIZE THE METRIC EVALUATION ----
REGRESSION¶
In [3]:
# =============================== REGRESSION ======================================
print()
print()
df_preprocessed_reg = df_preprocessed.copy()
y = df_preprocessed["gender:confidence"].reset_index(drop=True)
df_preprocessed_reg = df_preprocessed_reg.drop(['gender', "gender:confidence"], axis=1)
print()
print("=" * 50)
print('Boosted Regression Tree with Vectorised Text/Desc Features')
print("=" * 50)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed_reg, y, test_size=0.6, random_state=42)
boosted_reg = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Fit the model
boosted_reg.fit(X_train, y_train)
# Make predictions
y_pred = boosted_reg.predict(X_test)
y_pred_train = boosted_reg.predict(X_train)
y_tot_pred = boosted_reg.predict(df_preprocessed_reg)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Boosted Regression Tree with Vectorised Text/Desc Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()
# FEATURE IMPORTANCE
print()
print("Performing feature importance analysis...")
# Find column indices that start with 'desc_' and 'text_'
desc_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('desc_')]
text_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('text_')]
# Access the corresponding elements from the ndarray using the column indices
desc_array = boosted_reg.feature_importances_[desc_columns]
text_array = boosted_reg.feature_importances_[text_columns]
# Output the results
# print("desc_ column indices:", desc_columns)
# print("text_ column indices:", text_columns)
# print("desc_ array:\n", desc_array)
# print("text_ array:\n", text_array)
# Sum the values for desc_ and text_ columns
desc_sum = np.sum(boosted_reg.feature_importances_[desc_columns])
text_sum = np.sum(boosted_reg.feature_importances_[text_columns])
# Create a new DataFrame
new_data = {}
# Add the 'desc' and 'text' columns with the summed values
new_data['desc'] = [desc_sum]
new_data['text'] = [text_sum]
boosted_reg.feature_importances_
# Add the other feature columns that are not desc_ or text_
other_columns = [i for i in range(len(df_preprocessed_reg.columns)) if i not in desc_columns and i not in text_columns]
for i in other_columns:
col_name = df_preprocessed_reg.columns[i]
new_data[col_name] = [boosted_reg.feature_importances_[i]]
# Convert the new_data dictionary to a DataFrame
feature_importance = pd.DataFrame(new_data)
# Output the results
print(feature_importance)
# Plot feature importance
df_melted = feature_importance.melt(var_name='Feature', value_name='Importance in percentage')
df_melted = df_melted.sort_values(ascending=False, by="Importance in percentage")
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=df_melted, palette='viridis')
plt.suptitle('Boosted Regression Tree with Vectorised Text/Desc Features', fontsize=16)
plt.title('Feature Importance Analysis', fontsize=14)
plt.show()
# preprocess dataset for plots with regression results
df_preprocessed_diff = df_preprocessed_reg.copy()
df_preprocessed_diff['difference'] = (y.to_numpy() - y_tot_pred)
df_preprocessed_diff["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_diff["gender:confidence"] = y_reset
# filtering out coloumns that might be false mistaken
misclassified_df_reg = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
misclassified_df = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train.index)]
# plotting these columns
def scatterplot_mistaken_points(misclassified_df, X_train, model):
# Edit misclassified_df to include 'in X_train'
misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
# Create subsets for the two plots
df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]
# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Set the main title
fig.suptitle(f'{model}\nGender Confidence of "Mistaken" Records', fontsize=16)
# Plot 1: Points in X_train
sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
[df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[0].set_xlabel('Dataset')
axes[0].set_ylabel('Predicted')
axes[0].set_title(f'Training Set\nSample Size: {len(df_in_X_train)}')
# Plot 2: Points not in X_train
sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
[df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[1].set_xlabel('Dataset')
axes[1].set_ylabel('Predicted')
axes[1].set_title(f'Not Training Set\nSample Size: {len(df_not_in_X_train)}')
plt.tight_layout()
plt.show()
def scatter_plot(y, y_tot_pred, model):
# Plotting more results results
plt.figure(figsize=(10, 8))
plt.scatter(y, y_tot_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Dataset', fontsize=12)
plt.ylabel('Predicted', fontsize=12)
plt.suptitle(model, fontsize=16)
plt.title('Gender Confidence Comparison', fontsize=14)
plt.show()
scatterplot_mistaken_points(misclassified_df, X_train, "Boosted Regression Tree with Vectorised Text/Desc Features")
scatter_plot(y, y_tot_pred, "Boosted Regression Tree with Vectorised Text/Desc Features")
# ==============================analyze without text features=============================================
columns_to_drop = [col for col in df_preprocessed_reg.columns if col.startswith(('desc_', 'text_'))]
df_preprocessed_non_text = df_preprocessed_reg.drop(columns=columns_to_drop)
print(df_preprocessed_non_text)
print()
print("=" * 50)
print('Boosted Regression Tree without Vectorised Text/Desc Features')
print("=" * 50)
boosted_reg_non_text = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Split the dataset into training and testing sets
X_train_non_text, X_test_non_text, y_train_non_text, y_test_non_text = train_test_split(df_preprocessed_non_text, y, test_size=0.6, random_state=42)
# Fit the model
boosted_reg_non_text.fit(X_train_non_text, y_train_non_text)
# Make predictions
y_pred = boosted_reg_non_text.predict(X_test_non_text)
y_pred_train = boosted_reg_non_text.predict(X_train_non_text)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test_non_text, y_pred)
mse_train = mean_squared_error(y_train_non_text, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)
y_tot_pred = boosted_reg_non_text.predict(df_preprocessed_non_text)
print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Boosted Regression Tree without Vectorised Text/Desc Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()
# Get feature importances and plot from the model
print()
print("Performing feature importance analysis...")
feature_importances = boosted_reg_non_text.feature_importances_
column_names = X_train_non_text.columns
feature_importance_df = pd.DataFrame({
'Feature': column_names,
'Importance in percentage': feature_importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance in percentage', ascending=False)
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=feature_importance_df, palette='viridis')
plt.suptitle('Boosted Regression Tree without Vectorised Text/Desc Features', fontsize=16)
plt.title('Feature Importance Analysis', fontsize=14)
plt.show()
# adding the dataset gender confidence
df_preprocessed_non_text["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_non_text["gender:confidence"] = y_reset
# Inspecting coulumns that could be suspicous
df_preprocessed_non_text["difference"] = y.to_numpy() - y_tot_pred
misclassified_df = df_preprocessed_non_text[(df_preprocessed_non_text["difference"] > 0.1) & (df_preprocessed_non_text["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_non_text.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_non_text.index)]
scatterplot_mistaken_points(misclassified_df, X_train_non_text, "Boosted Regression Tree without Vectorised Text/Desc Features")
scatter_plot(y, y_tot_pred, "Boosted Regression Tree without Vectorised Text/Desc Features")
# ====================================Analyzing with a linear regression (Least Squares Implementation)====================
print()
print("=" * 50)
print('Linear Regression Tree with Vectorised Text/Desc Features')
print("=" * 50)
X_train_lin = sm.add_constant(X_train)
X_test_lin = sm.add_constant(X_test)
df_preprocessed_lin = sm.add_constant(df_preprocessed_reg)
model = sm.OLS(y_train, X_train_lin) # Ordinary least squares (unregularized)
results = model.fit()
# run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
y_lin_train = results.predict(X_train_lin)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_lin_pred)
mse_total = mean_squared_error(y, y_lin_tot_pred)
mse_train = mean_squared_error(y_train, y_lin_train)
print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Linear Regression Tree with Vectorised Textual Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()
# final preprocess
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_lin["gender:confidence"] = y
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred
# identify mistaken users
misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]
scatter_plot(y, y_lin_tot_pred, "Linear Regression Tree with Vectorised Text/Desc Features")
scatterplot_mistaken_points(misclassified_df, X_train_lin, "Linear Regression Tree with Vectorised Text/Desc Features")
# ================================Identity final mistaken samples====================================
common_samples = misclassified_df_reg.index.intersection(misclassified_df.index)
common_df = misclassified_df.loc[common_samples]
scatterplot_mistaken_points(common_df, X_train_lin, "Boosted and Linear Regression Trees (Intersection) with Vectorised Text/Desc Features")
================================================== Boosted Regression Tree with Vectorised Text/Desc Features ================================================== Mean Squared Error (Train): 0.0266 Mean Squared Error (Test): 0.0290 Mean Squared Error (Total): 0.0280
Performing feature importance analysis...
desc text favorites_per_day retweets_per_day tweets_per_day \
0 0.307368 0.365717 0.021232 0.0 0.121167
profile_created_year tweet_created_year link_R link_G link_B \
0 0.155415 0.0 0.000336 0.011339 0.000434
sidebar_R sidebar_G sidebar_B
0 0.005375 0.006886 0.00473
favorites_per_day retweets_per_day tweets_per_day \
0 0.000000 0.000000 28.156306
1 0.015557 0.000000 1.709220
2 2.147921 0.000279 1.567681
3 0.036214 0.000000 0.303514
4 9.797322 0.000000 8.259911
... ... ... ...
18831 0.090636 0.000000 0.234994
18832 0.568938 0.000000 3.061580
18833 0.011366 0.000000 6.005683
18834 16.336871 0.000000 12.937933
18835 0.878510 0.000000 0.766728
profile_created_year tweet_created_year link_R link_G link_B \
0 2013 2015 8 194 194
1 2012 2015 0 132 180
2 2014 2015 171 184 194
3 2009 2015 0 132 180
4 2014 2015 59 148 217
... ... ... ... ... ...
18831 2015 2015 0 132 180
18832 2012 2015 207 185 41
18833 2012 2015 0 132 180
18834 2012 2015 146 102 204
18835 2014 2015 0 132 180
sidebar_R sidebar_G sidebar_B
0 255 255 255
1 192 222 237
2 192 222 237
3 192 222 237
4 0 0 0
... ... ... ...
18831 192 222 237
18832 0 0 0
18833 192 222 237
18834 0 0 0
18835 192 222 237
[18836 rows x 11 columns]
==================================================
Boosted Regression Tree without Vectorised Text/Desc Features
==================================================
Mean Squared Error (Train): 0.0274
Mean Squared Error (Test): 0.0291
Mean Squared Error (Total): 0.0280
Performing feature importance analysis...
================================================== Linear Regression Tree with Vectorised Text/Desc Features ================================================== Mean Squared Error (Train): 0.0166 Mean Squared Error (Test): 0.0499 Mean Squared Error (Total): 0.0366
CLASSIFICATION¶
In [4]:
# ============================== CLASSIFICATION ==============================
print()
print()
print('---- CLASSIFICATION ----')
# Features and target
X = df_preprocessed.drop(columns=['gender']) # Assuming 'gender' is the target variable
y = df_preprocessed['gender']
# Standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
rf_classifier.fit(X_train, y_train)
# Predict on test data
y_pred_rf = rf_classifier.predict(X_test)
# Evaluate the performance
print("Accuracy Score: ", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
# Train the model
xgb_model.fit(X_train, y_train)
# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
# Evaluate the model
print("\nXGBoost Classifier Report:")
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
# Initialize LightGBM classifier
lgb_clf = lgb.LGBMClassifier(n_estimators=100, random_state=42)
# Fit the model
lgb_clf.fit(X_train, y_train)
# Predict
y_pred_lgb = lgb_clf.predict(X_test)
# Evaluation
print("LightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgb))
# Helper function to plot confusion matrix
def plot_confusion_matrix(y_test, y_pred, model_name):
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title(f'{model_name} Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
# Helper function to extract and display classification report with model name
def get_classification_report(y_test, y_pred, model_name):
report = classification_report(y_test, y_pred, output_dict=True)
df = pd.DataFrame(report).transpose()
df['model'] = model_name
return df
# Random Forest Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_rf, "Random Forest")
rf_report = get_classification_report(y_test, y_pred_rf, "Random Forest")
# XGBoost Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_xgb, "XGBoost")
xgb_report = get_classification_report(y_test, y_pred_xgb, "XGBoost")
# LightGBM Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_lgb, "LightGBM")
lgb_report = get_classification_report(y_test, y_pred_lgb, "LightGBM")
# Combine all reports
combined_report = pd.concat([rf_report, xgb_report, lgb_report])
# Debugging Step: Check the combined report structure
print("Combined Classification Report:\n", combined_report.head())
# Filter out rows for precision, recall, and f1-score
combined_report_filtered = combined_report[
combined_report.index.isin(['0', '1']) # Filter for the classes
].reset_index()
# Debugging Step: Check the filtered report structure
print("Filtered Report for Precision, Recall, and F1-Score:\n", combined_report_filtered.head())
# Plot Precision, Recall, and F1-Score for each model
metrics = ['precision', 'recall', 'f1-score']
for metric in metrics:
# Debugging Step: Filter for specific metric
print(f"Data for {metric}:")
print(combined_report_filtered[['index', metric, 'model']])
plt.figure(figsize=(10, 6))
sns.barplot(
x="index",
y=metric,
hue="model",
data=combined_report_filtered[['index', metric, 'model']]
)
plt.title(f'{metric.capitalize()} Comparison')
plt.ylabel(metric.capitalize())
plt.xlabel('Class (0 = Human, 1 = Non-Human)')
plt.show()
# Accuracy comparison
accuracies = {
'Random Forest': accuracy_score(y_test, y_pred_rf),
'XGBoost': accuracy_score(y_test, y_pred_xgb),
'LightGBM': accuracy_score(y_test, y_pred_lgb)
}
plt.figure(figsize=(6, 4))
plt.bar(accuracies.keys(), accuracies.values(), color=['blue', 'green', 'red'])
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.show()
---- CLASSIFICATION ----
Accuracy Score: 0.6239384288747346
Confusion Matrix:
[[665 469 133]
[289 929 100]
[251 175 757]]
Classification Report:
precision recall f1-score support
0 0.55 0.52 0.54 1267
1 0.59 0.70 0.64 1318
2 0.76 0.64 0.70 1183
accuracy 0.62 3768
macro avg 0.64 0.62 0.63 3768
weighted avg 0.63 0.62 0.62 3768
XGBoost Classifier Report:
precision recall f1-score support
0 0.56 0.54 0.55 1267
1 0.60 0.65 0.62 1318
2 0.72 0.67 0.69 1183
accuracy 0.62 3768
macro avg 0.62 0.62 0.62 3768
weighted avg 0.62 0.62 0.62 3768
Accuracy: 0.6191613588110403
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025816 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 36890
[LightGBM] [Info] Number of data points in the train set: 15068, number of used features: 1766
[LightGBM] [Info] Start training from score -1.117843
[LightGBM] [Info] Start training from score -1.029513
[LightGBM] [Info] Start training from score -1.152536
LightGBM Classification Report:
precision recall f1-score support
0 0.57 0.55 0.56 1267
1 0.61 0.65 0.63 1318
2 0.72 0.69 0.70 1183
accuracy 0.63 3768
macro avg 0.63 0.63 0.63 3768
weighted avg 0.63 0.63 0.63 3768
Combined Classification Report:
precision recall f1-score support model
0 0.551867 0.524862 0.538026 1267.000000 Random Forest
1 0.590591 0.704856 0.642684 1318.000000 Random Forest
2 0.764646 0.639899 0.696733 1183.000000 Random Forest
accuracy 0.623938 0.623938 0.623938 0.623938 Random Forest
macro avg 0.635702 0.623205 0.625814 3768.000000 Random Forest
Filtered Report for Precision, Recall, and F1-Score:
index precision recall f1-score support model
0 0 0.551867 0.524862 0.538026 1267.0 Random Forest
1 1 0.590591 0.704856 0.642684 1318.0 Random Forest
2 0 0.557096 0.539069 0.547934 1267.0 XGBoost
3 1 0.596540 0.654021 0.623959 1318.0 XGBoost
4 0 0.568994 0.553275 0.561024 1267.0 LightGBM
Data for precision:
index precision model
0 0 0.551867 Random Forest
1 1 0.590591 Random Forest
2 0 0.557096 XGBoost
3 1 0.596540 XGBoost
4 0 0.568994 LightGBM
5 1 0.605674 LightGBM
Data for recall: index recall model 0 0 0.524862 Random Forest 1 1 0.704856 Random Forest 2 0 0.539069 XGBoost 3 1 0.654021 XGBoost 4 0 0.553275 LightGBM 5 1 0.647951 LightGBM
Data for f1-score: index f1-score model 0 0 0.538026 Random Forest 1 1 0.642684 Random Forest 2 0 0.547934 XGBoost 3 1 0.623959 XGBoost 4 0 0.561024 LightGBM 5 1 0.626100 LightGBM
ASSOCIATION RULES¶
In [10]:
# ============================== ASSOCIATION RULES ==============================
print()
print()
print('---- ASSOCIATION RULES ----')
# Binarize numeric columns
df_asso['high_favorites'] = df_asso['favorites_per_day'] > df_asso['favorites_per_day'].mean()
df_asso['high_retweets'] = df_asso['retweets_per_day'] > df_asso['retweets_per_day'].mean()
df_asso['high_tweets'] = df_asso['tweets_per_day'] > df_asso['tweets_per_day'].mean()
# Binarize year columns (profile_created_year and tweet_created_year)
# Example: Set threshold year as 2015
df_asso['profile_recent'] = df_asso['profile_created_year'] >= 2015
df_asso['tweet_recent'] = df_asso['tweet_created_year'] >= 2015
# Select only the binary columns
df_apriori = df_asso[['high_favorites', 'high_retweets', 'high_tweets',
'profile_recent', 'tweet_recent',
'tweet_location_encoded', 'user_timezone_encoded']]
# Convert all columns to int (0 or 1)
df_apriori = df_apriori.astype(int)
# Apply Apriori
frequent_itemsets = apriori(df_apriori, min_support=0.05, use_colnames=True)
# Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# Display the rules
print(rules)
top_frequent_itemsets = frequent_itemsets.nlargest(10, 'support')
plt.figure(figsize=(10, 6))
sns.barplot(x='support', y='itemsets', data=top_frequent_itemsets)
plt.title('Top 10 Frequent Itemsets by Support')
plt.xlabel('Support')
plt.ylabel('Itemsets')
plt.show()
# ---------------------------
# Visualization 2: Scatter Plot of Association Rules by Confidence and Lift
# ---------------------------
plt.figure(figsize=(10, 6))
sns.scatterplot(x='confidence', y='lift', size='support', data=rules, hue='antecedents', palette='viridis', sizes=(40, 200))
plt.title('Association Rules: Confidence vs Lift')
plt.xlabel('Confidence')
plt.ylabel('Lift')
plt.legend(title='Antecedents', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
# ---------------------------
# Visualization 3: Heatmap of Support, Confidence, and Lift
# ---------------------------
plt.figure(figsize=(10, 6))
sns.heatmap(rules[['support', 'confidence', 'lift']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between Support, Confidence, and Lift')
plt.show()
---- ASSOCIATION RULES ----
antecedents consequents \
0 (high_favorites) (high_tweets)
1 (high_tweets) (high_favorites)
2 (high_favorites) (tweet_recent)
3 (tweet_recent) (high_favorites)
4 (high_tweets) (tweet_recent)
5 (tweet_recent) (high_tweets)
6 (profile_recent) (tweet_recent)
7 (tweet_recent) (profile_recent)
8 (high_favorites, high_tweets) (tweet_recent)
9 (high_favorites, tweet_recent) (high_tweets)
10 (high_tweets, tweet_recent) (high_favorites)
11 (high_favorites) (high_tweets, tweet_recent)
12 (high_tweets) (high_favorites, tweet_recent)
13 (tweet_recent) (high_favorites, high_tweets)
antecedent support consequent support support confidence lift \
0 0.210607 0.271767 0.066097 0.313839 1.15481
1 0.271767 0.210607 0.066097 0.243212 1.15481
2 0.210607 1.000000 0.210607 1.000000 1.00000
3 1.000000 0.210607 0.210607 0.210607 1.00000
4 0.271767 1.000000 0.271767 1.000000 1.00000
5 1.000000 0.271767 0.271767 0.271767 1.00000
6 0.175568 1.000000 0.175568 1.000000 1.00000
7 1.000000 0.175568 0.175568 0.175568 1.00000
8 0.066097 1.000000 0.066097 1.000000 1.00000
9 0.210607 0.271767 0.066097 0.313839 1.15481
10 0.271767 0.210607 0.066097 0.243212 1.15481
11 0.210607 0.271767 0.066097 0.313839 1.15481
12 0.271767 0.210607 0.066097 0.243212 1.15481
13 1.000000 0.066097 0.066097 0.066097 1.00000
leverage conviction zhangs_metric
0 0.008861 1.061316 0.169823
1 0.008861 1.043082 0.184085
2 0.000000 inf 0.000000
3 0.000000 1.000000 0.000000
4 0.000000 inf 0.000000
5 0.000000 1.000000 0.000000
6 0.000000 inf 0.000000
7 0.000000 1.000000 0.000000
8 0.000000 inf 0.000000
9 0.008861 1.061316 0.169823
10 0.008861 1.043082 0.184085
11 0.008861 1.061316 0.169823
12 0.008861 1.043082 0.184085
13 0.000000 1.000000 0.000000